In [1]:
import pandas as pd 
import json
import ast

# ** PLAN:** (do it the old inefficient way...)

* Read in the **SalesInfos** dataframe at whole 
* Read in the **EvaluationInfos** data as chunks, each with 50000 rows 
* Create 10 following variables in SalesInfos dataframe :
  * **evaluationInfo_evaluationYear**
  * **evaluationInfo_lastChange** 
  * **evaluationInfo_propertyValue**
  * **evaluationInfo_landValue**
  * **evaluationInfo_deductionSum**
  * **evaluationInfo_usage** 
  * **evaluationInfo_residentialUnits**
  * **evaluationInfo_propertyValueArea**
  * **evaluationInfo_rebuildYear**
  * **evaluationInfo_areaSize**
* loop over each row in SalesInfos 
  * loop over each chunk 
    * SHORT **EvaluationInfos** Chunk by dates
    * If salesinfo Row has same ID as the row in EvaluationInfos
      * If salesInfo date is > than evaluationInfo date:
        * add the values to new columns in SalesInfo dataframe
        * BREAK (move to next row in SalesInfos)

# ---- read in the data 

In [2]:
path_sales = r'D:\Thesis\Properties\Denmark\RE_due_scraping_properties\Boliga_dk\Creating_main_dataset_for_sales_data\Data_split\11_salesInfos\Ready\Boliga_propertySales_salesInfos_Ready.csv'
data_salesInfo = pd.read_csv(path_sales, encoding = 'utf-8', low_memory=False) #,nrows=1000)
data_salesInfo.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623800 entries, 0 to 2623799
Data columns (total 8 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID  2623800 non-null  object 
 1   salesInfo_isSalesValid           2623800 non-null  bool   
 2   salesInfo_handoverCode           2623340 non-null  float64
 3   salesInfo_handoverName           2623340 non-null  object 
 4   salesInfo_deedIssueDate          2623340 non-null  object 
 5   salesInfo_price                  2623340 non-null  float64
 6   salesInfo_recalculationDate      2623340 non-null  object 
 7   salesInfo_rebuildYear            2623340 non-null  float64
dtypes: bool(1), float64(3), object(4)
memory usage: 142.6+ MB


In [None]:
data_salesInfo.drop_duplicates(subset=['RowID_MAIN_boliga_ROW_ID_unitID','salesInfo_deedIssueDate', 'salesInfo_recalculationDate','salesInfo_price']).info()

# --- -CREATE UNIQUE ID column for the sales data

In [3]:
data_salesInfo['unique_sales_ID'] = range(1, len(data_salesInfo) + 1)

In [64]:
data_salesInfo.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623800 entries, 0 to 2623799
Data columns (total 9 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID  2623800 non-null  object 
 1   salesInfo_isSalesValid           2623800 non-null  bool   
 2   salesInfo_handoverCode           2623340 non-null  float64
 3   salesInfo_handoverName           2623340 non-null  object 
 4   salesInfo_deedIssueDate          2623340 non-null  object 
 5   salesInfo_price                  2623340 non-null  float64
 6   salesInfo_recalculationDate      2623340 non-null  object 
 7   salesInfo_rebuildYear            2623340 non-null  float64
 8   unique_sales_ID                  2623800 non-null  int64  
dtypes: bool(1), float64(3), int64(1), object(4)
memory usage: 162.6+ MB


# ---- convert salesInfo_deedIssueDate to date object with only day,month and year  


### NOTE !!!!!!!!!!!!!!!! THE SEEMS TO BE AN ERROR IN THE YEAR OF THE DATA YEARS WITH 2000 AND 1999 SEEM TO GET THE YEAR 1899 IN THE salesInfo_deedIssueDate - BETTER TO GET THE YEAR FROM RECALCULATIONdATE

In [4]:
data_salesInfo['salesInfo_deedIssueDate'] = pd.to_datetime(data_salesInfo['salesInfo_deedIssueDate'], format="%Y-%m-%dT%H:%M:%S").dt.date

# -------- convert salesInfo_recalculationDate to date object as well 

In [5]:
data_salesInfo['salesInfo_recalculationDate'] = pd.to_datetime(data_salesInfo['salesInfo_recalculationDate'], format="%Y-%m-%dT%H:%M:%S",errors='coerce').dt.date

# ------- REMOVE NON VALID SALES 

In [67]:
data_salesInfo['salesInfo_isSalesValid'].value_counts()

salesInfo_isSalesValid
True     2623340
False        460
Name: count, dtype: int64

In [6]:
data_salesInfo = data_salesInfo[data_salesInfo['salesInfo_isSalesValid']==True]
data_salesInfo = data_salesInfo.reset_index(drop=True)
data_salesInfo.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623340 entries, 0 to 2623339
Data columns (total 9 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID  2623340 non-null  object 
 1   salesInfo_isSalesValid           2623340 non-null  bool   
 2   salesInfo_handoverCode           2623340 non-null  float64
 3   salesInfo_handoverName           2623340 non-null  object 
 4   salesInfo_deedIssueDate          2623340 non-null  object 
 5   salesInfo_price                  2623340 non-null  float64
 6   salesInfo_recalculationDate      2623335 non-null  object 
 7   salesInfo_rebuildYear            2623340 non-null  float64
 8   unique_sales_ID                  2623340 non-null  int64  
dtypes: bool(1), float64(3), int64(1), object(4)
memory usage: 162.6+ MB


# ---- FILL NA values in 'salesInfo_recalculationDate' using values in 'salesInfo_deedIssueDate'

In [69]:
data_salesInfo['salesInfo_recalculationDate'].isna().sum()

5

In [70]:
data_salesInfo[data_salesInfo['salesInfo_recalculationDate'].isna()] 

Unnamed: 0,RowID_MAIN_boliga_ROW_ID_unitID,salesInfo_isSalesValid,salesInfo_handoverCode,salesInfo_handoverName,salesInfo_deedIssueDate,salesInfo_price,salesInfo_recalculationDate,salesInfo_rebuildYear,unique_sales_ID
2053025,ebbd554a-5709-45c5-8d78-ee785246044b,True,1.0,Alm. frit salg,2015-08-11,1655000.0,NaT,2021.0,2053326
2144054,4d4e0902-6ffd-499a-89a1-19e04701a6a1,True,1.0,Alm. frit salg,2013-11-26,1100000.0,NaT,1999.0,2144379
2158433,ca02a81c-9f02-4eac-9b1c-86812d3317ef,True,1.0,Alm. frit salg,2013-08-09,1850000.0,NaT,2007.0,2158759
2461719,fd059f90-e5e2-4700-bd3f-ef424fd1a876,True,2.0,Familieoverdragelse,2013-11-27,1172562.0,NaT,1979.0,2462136
2604468,6f254bc0-6010-48f5-989c-bd2eecb2ef0c,True,3.0,Auktion,2015-09-08,295000.0,NaT,1965.0,2604924


In [7]:
# ------- FILL NA VALUES !
data_salesInfo['salesInfo_recalculationDate'] = data_salesInfo['salesInfo_recalculationDate'].fillna(data_salesInfo['salesInfo_deedIssueDate'])


In [72]:
data_salesInfo['salesInfo_recalculationDate'].isna().sum()

0

In [17]:
data_salesInfo['salesInfo_handoverName'].value_counts()

salesInfo_handoverName
Alm. frit salg         2404797
Familieoverdragelse     120039
Salg i øvrigt            57792
Auktion                  40712
Name: count, dtype: int64

# ---- create sales Year column (based on recalculationDate)

In [8]:
# Extract the year and add it to a new column 'Year'
data_salesInfo['salesInfo_year_of_sale'] = data_salesInfo['salesInfo_recalculationDate'].apply(lambda x: int(x.year) if not pd.isna(x) and hasattr(x, 'year') else x)

In [74]:
data_salesInfo['salesInfo_year_of_sale'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2623340 entries, 0 to 2623339
Series name: salesInfo_year_of_sale
Non-Null Count    Dtype
--------------    -----
2623340 non-null  int64
dtypes: int64(1)
memory usage: 20.0 MB


In [20]:
data_salesInfo.salesInfo_year_of_sale.value_counts()           

salesInfo_year_of_sale
2006    130677
2021    124215
2007    120239
2020    112475
2017    108025
2018    105154
2019    102030
2010     97019
2015     96916
2016     96823
2005     91612
2008     89190
2022     82077
2014     79171
2009     73551
2011     71753
2013     71188
2004     71092
1997     70660
2012     69928
1998     69286
1999     68338
1996     67762
2003     66331
2000     65326
1995     65215
2002     63620
2001     63577
1994     61115
2023     56909
1992     56559
1993     55468
1753        29
1989         2
1987         2
2112         2
1974         1
1980         1
1983         1
1984         1
Name: count, dtype: int64

In [21]:
len(data_salesInfo[data_salesInfo['salesInfo_year_of_sale']>=2006])

1687342

# ----- Create the MERGING by year 
## SO will will merge sales data with the last years evalutaion !  so I will create salesInfo_sale_year_before_merge

In [77]:
data_salesInfo['salesInfo_year_of_sale']-1

0          2005
1          2022
2          2016
3          2006
4          2022
           ... 
2623335    2008
2623336    2000
2623337    2005
2623338    2005
2623339    2000
Name: salesInfo_year_of_sale, Length: 2623340, dtype: int64

In [9]:
# Extract the year and add it to a new column 'Year'
data_salesInfo['salesInfo_sale_year_before_merge'] = data_salesInfo['salesInfo_year_of_sale']-1 #minus 1 year

# ---- FILTER SALES DATA: take Only Sales between years 2006-2023, drop the rest 
## (no - i will do this once I have merged everything)

In [22]:
# data_salesInfo = data_salesInfo[data_salesInfo['salesInfo_year_of_sale']>=2006]
# data_salesInfo = data_salesInfo.reset_index(drop=True)
# data_salesInfo.info()

# ---- Create new columns in Salesinfos 

In [None]:
# data_salesInfo['evaluationInfo_evaluationYear'] = None
# data_salesInfo['evaluationInfo_lastChange'] = None
# data_salesInfo['evaluationInfo_propertyValue'] = None
# data_salesInfo['evaluationInfo_landValue'] = None
# data_salesInfo['evaluationInfo_deductionSum'] = None
# data_salesInfo['evaluationInfo_usage'] = None
# data_salesInfo['evaluationInfo_residentialUnits'] = None
# data_salesInfo['evaluationInfo_propertyValueArea'] = None
# data_salesInfo['evaluationInfo_rebuildYear'] = None
# data_salesInfo['evaluationInfo_areaSize'] = None

In [26]:
data_salesInfo.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623340 entries, 0 to 2623339
Data columns (total 10 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID  2623340 non-null  object 
 1   salesInfo_isSalesValid           2623340 non-null  bool   
 2   salesInfo_handoverCode           2623340 non-null  float64
 3   salesInfo_handoverName           2623340 non-null  object 
 4   salesInfo_deedIssueDate          2623340 non-null  object 
 5   salesInfo_price                  2623340 non-null  float64
 6   salesInfo_recalculationDate      2623340 non-null  object 
 7   salesInfo_rebuildYear            2623340 non-null  float64
 8   unique_sales_ID                  2623340 non-null  int64  
 9   salesInfo_year_of_sale           2623340 non-null  int64  
dtypes: bool(1), float64(3), int64(2), object(4)
memory usage: 182.6+ MB


# ------ RUN THE PROGRAM  AS WHOLE!

In [10]:
# Your path to the evaluation CSV file
path_evaluation = r'D:\Thesis\Properties\Denmark\RE_due_scraping_properties\Boliga_dk\Creating_main_dataset_for_sales_data\Data_split\12_evalutationInfos\Ready\Boliga_propertySales_evaluationInfo_Ready.csv'

# Initialize an empty DataFrame to store the merged data
merged_data_list = []

# Read the evaluation data in chunks
evaluation_WHOLE = pd.read_csv(path_evaluation, low_memory=False )#, chunksize=chunk_size)

chunk_counter = 0

# Convert the 'lastChange' column to datetime and sort the DataFrame
evaluation_WHOLE['lastChange'] = pd.to_datetime(evaluation_WHOLE['lastChange'], format="%Y-%m-%dT%H:%M:%S").dt.date
evaluation_WHOLE = evaluation_WHOLE.sort_values(by='lastChange', ascending=False)

# Merge the two DataFrames based on the common column 'RowID_MAIN_boliga_ROW_ID_unitID'
merged_chunk = pd.merge(data_salesInfo, evaluation_WHOLE, left_on=['RowID_MAIN_boliga_ROW_ID_unitID','salesInfo_sale_year_before_merge'], right_on=['RowID_MAIN_boliga_ROW_ID_unitID', 'evaluationYear'], how='left')

del evaluation_WHOLE

In [11]:
merged_chunk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2665315 entries, 0 to 2665314
Data columns (total 21 columns):
 #   Column                            Dtype  
---  ------                            -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID   object 
 1   salesInfo_isSalesValid            bool   
 2   salesInfo_handoverCode            float64
 3   salesInfo_handoverName            object 
 4   salesInfo_deedIssueDate           object 
 5   salesInfo_price                   float64
 6   salesInfo_recalculationDate       object 
 7   salesInfo_rebuildYear             float64
 8   unique_sales_ID                   int64  
 9   salesInfo_year_of_sale            int64  
 10  salesInfo_sale_year_before_merge  int64  
 11  evaluationYear                    float64
 12  lastChange                        object 
 13  propertyValue                     float64
 14  landValue                         float64
 15  deductionSum                      float64
 16  usage                             ob

In [13]:
pd.set_option('display.max_columns',None)

In [14]:
merged_chunk[merged_chunk.duplicated(subset='unique_sales_ID', keep=False)]

Unnamed: 0,RowID_MAIN_boliga_ROW_ID_unitID,salesInfo_isSalesValid,salesInfo_handoverCode,salesInfo_handoverName,salesInfo_deedIssueDate,salesInfo_price,salesInfo_recalculationDate,salesInfo_rebuildYear,unique_sales_ID,salesInfo_year_of_sale,salesInfo_sale_year_before_merge,evaluationYear,lastChange,propertyValue,landValue,deductionSum,usage,residentialUnits,propertyValueArea,rebuildYear,areaSize
324,a102f33e-affe-4de9-983e-5c4513a4693d,True,3.0,Auktion,2013-09-04,105000.0,2012-09-26,1870.0,325,2012,2011,2011.0,2012-11-20,230000.0,223500.0,0.0,Beboelsesejendom,1.0,230000.0,1870.0,690.0
325,a102f33e-affe-4de9-983e-5c4513a4693d,True,3.0,Auktion,2013-09-04,105000.0,2012-09-26,1870.0,325,2012,2011,2011.0,2011-10-01,580000.0,223500.0,0.0,Beboelsesejendom,1.0,580000.0,1870.0,690.0
405,4a4a8a51-fa2a-4b05-bc93-ab84c1247746,True,1.0,Alm. frit salg,2008-04-21,2400000.0,2008-04-21,1798.0,405,2008,2007,2007.0,2010-11-17,2550000.0,312800.0,0.0,"Ejerlejlighed, beboelse",1.0,2550000.0,1798.0,0.0
406,4a4a8a51-fa2a-4b05-bc93-ab84c1247746,True,1.0,Alm. frit salg,2008-04-21,2400000.0,2008-04-21,1798.0,405,2008,2007,2007.0,2007-10-01,2550000.0,312800.0,0.0,"Ejerlejlighed, beboelse",1.0,2550000.0,1798.0,0.0
635,8b881590-85c2-42dd-a800-272e901d5a2f,True,1.0,Alm. frit salg,2010-05-26,1450000.0,2010-03-22,2017.0,634,2010,2009,2009.0,2010-03-01,1450000.0,375000.0,0.0,Beboelsesejendom,1.0,1450000.0,2017.0,700.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2660364,2aaf9ad3-a556-4751-925c-c5131cef35ef,True,2.0,Familieoverdragelse,2012-06-18,600000.0,2012-03-31,1971.0,2618852,2012,2011,2011.0,2011-10-01,440000.0,108400.0,0.0,Beboelsesejendom,1.0,440000.0,1971.0,868.0
2660553,0521778c-b716-435a-9823-16dae78da039,True,4.0,Salg i øvrigt,2008-03-11,347875.0,2008-03-11,1963.0,2619041,2008,2007,2007.0,2010-02-24,1750000.0,302100.0,0.0,"Ejerlejlighed, beboelse",1.0,1750000.0,1963.0,0.0
2660554,0521778c-b716-435a-9823-16dae78da039,True,4.0,Salg i øvrigt,2008-03-11,347875.0,2008-03-11,1963.0,2619041,2008,2007,2007.0,2007-10-01,1750000.0,302100.0,0.0,"Ejerlejlighed, beboelse",1.0,1750000.0,1963.0,0.0
2662331,7f20afe1-def6-40e7-a796-cbf5852d5cb0,True,3.0,Auktion,2013-03-06,630000.0,2012-10-29,1940.0,2620818,2012,2011,2011.0,2013-01-07,660000.0,338300.0,0.0,Beboelsesejendom,1.0,660000.0,1940.0,553.0


In [15]:
# Sort DataFrame by 'lastChange' in descending order
merged_chunk = merged_chunk.sort_values(by=['unique_sales_ID', 'lastChange'], ascending=[True, False])

# Keep only the first occurrence (latest date) for each 'unique_sales_ID'
merged_chunk_latest = merged_chunk.drop_duplicates(subset='unique_sales_ID', keep='first')
merged_chunk_latest.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2623340 entries, 0 to 2665314
Data columns (total 21 columns):
 #   Column                            Dtype  
---  ------                            -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID   object 
 1   salesInfo_isSalesValid            bool   
 2   salesInfo_handoverCode            float64
 3   salesInfo_handoverName            object 
 4   salesInfo_deedIssueDate           object 
 5   salesInfo_price                   float64
 6   salesInfo_recalculationDate       object 
 7   salesInfo_rebuildYear             float64
 8   unique_sales_ID                   int64  
 9   salesInfo_year_of_sale            int64  
 10  salesInfo_sale_year_before_merge  int64  
 11  evaluationYear                    float64
 12  lastChange                        object 
 13  propertyValue                     float64
 14  landValue                         float64
 15  deductionSum                      float64
 16  usage                             object 

## _________________ RENAME COLUMS _____________________

In [18]:
change_column_names = {
    'evaluationYear': 'EvaluationInfo_evaluationYear',
    'lastChange': 'EvaluationInfo_lastChange', 
    'propertyValue': 'EvaluationInfo_propertyValue',
    'landValue': 'EvaluationInfo_landValue',
    'deductionSum': 'EvaluationInfo_deductionSum',
    'usage': 'EvaluationInfo_usage', 
    'residentialUnits': 'EvaluationInfo_residentialUnits',
    'propertyValueArea': 'EvaluationInfo_propertyValueArea',
    'rebuildYear': 'EvaluationInfo_rebuildYear',
    'areaSize': 'EvaluationInfo_areaSize',
}

merged_chunk_latest = merged_chunk_latest.rename(columns=change_column_names)
merged_chunk_latest.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2623340 entries, 0 to 2665314
Data columns (total 21 columns):
 #   Column                            Dtype  
---  ------                            -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID   object 
 1   salesInfo_isSalesValid            bool   
 2   salesInfo_handoverCode            float64
 3   salesInfo_handoverName            object 
 4   salesInfo_deedIssueDate           object 
 5   salesInfo_price                   float64
 6   salesInfo_recalculationDate       object 
 7   salesInfo_rebuildYear             float64
 8   unique_sales_ID                   int64  
 9   salesInfo_year_of_sale            int64  
 10  salesInfo_sale_year_before_merge  int64  
 11  EvaluationInfo_evaluationYear     float64
 12  EvaluationInfo_lastChange         object 
 13  EvaluationInfo_propertyValue      float64
 14  EvaluationInfo_landValue          float64
 15  EvaluationInfo_deductionSum       float64
 16  EvaluationInfo_usage              object 

In [19]:
merged_chunk_latest.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2623340 entries, 0 to 2665314
Data columns (total 21 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID   2623340 non-null  object 
 1   salesInfo_isSalesValid            2623340 non-null  bool   
 2   salesInfo_handoverCode            2623340 non-null  float64
 3   salesInfo_handoverName            2623340 non-null  object 
 4   salesInfo_deedIssueDate           2623340 non-null  object 
 5   salesInfo_price                   2623340 non-null  float64
 6   salesInfo_recalculationDate       2623340 non-null  object 
 7   salesInfo_rebuildYear             2623340 non-null  float64
 8   unique_sales_ID                   2623340 non-null  int64  
 9   salesInfo_year_of_sale            2623340 non-null  int64  
 10  salesInfo_sale_year_before_merge  2623340 non-null  int64  
 11  EvaluationInfo_evaluationYear     1364441 

# ______________________________ SAVE RESULT _______________________________________

In [23]:
sub_columns = [
    'RowID_MAIN_boliga_ROW_ID_unitID',
    'salesInfo_isSalesValid',
    'salesInfo_handoverCode',
    'salesInfo_handoverName',
    'salesInfo_deedIssueDate',
    'salesInfo_price',
    'salesInfo_recalculationDate',
    'salesInfo_rebuildYear',
    'unique_sales_ID',
    'salesInfo_year_of_sale',
    'EvaluationInfo_evaluationYear',
    'EvaluationInfo_lastChange',
    'EvaluationInfo_propertyValue',
    'EvaluationInfo_landValue',
    'EvaluationInfo_deductionSum',
    'EvaluationInfo_usage',
    'EvaluationInfo_residentialUnits',
    'EvaluationInfo_propertyValueArea',
    'EvaluationInfo_rebuildYear',
    'EvaluationInfo_areaSize',
]

path=r'D:\Thesis\Properties\Denmark\RE_due_scraping_properties\Boliga_dk\Creating_main_dataset_for_sales_data\Data_merge\step_1_mergin_Sales_transactions_with_evaluation_data\Boliga_salesData_SalesTransactions_with_evaluation_2623340rows_20columns.csv'

merged_chunk_latest[sub_columns].to_csv(path, encoding='utf-8', index=False)

# ------ RUN THE PROGRAM  CHUNKS!

In [36]:
# Your path to the evaluation CSV file
path_evaluation = r'D:\Thesis\Properties\Denmark\RE_due_scraping_properties\Boliga_dk\Creating_main_dataset_for_sales_data\Data_split\12_evalutationInfos\Ready\Boliga_propertySales_evaluationInfo_Ready.csv'

# Define chunk size based on your available memory for DataFrame B
chunk_size = 1000000

# Initialize an empty DataFrame to store the merged data
merged_data_list = []

# Read the evaluation data in chunks
evaluation_chunks = pd.read_csv(path_evaluation, chunksize=chunk_size)

chunk_counter = 0
# Loop through evaluation chunks
for evaluation_chunk in evaluation_chunks:
    chunk_counter+=1
    print(f'-----> chunk nr.{chunk_counter}')
    print(f'chunk lenght: {len(evaluation_chunk)}')
    # Convert the 'lastChange' column to datetime and sort the DataFrame
    evaluation_chunk['lastChange'] = pd.to_datetime(evaluation_chunk['lastChange'], format="%Y-%m-%dT%H:%M:%S").dt.date
    evaluation_chunk = evaluation_chunk.sort_values(by='lastChange', ascending=False)

    # Merge the two DataFrames based on the common column 'RowID_MAIN_boliga_ROW_ID_unitID'
    merged_chunk = pd.merge(data_salesInfo, evaluation_chunk, left_on=['RowID_MAIN_boliga_ROW_ID_unitID'], right_on=['RowID_MAIN_boliga_ROW_ID_unitID'], how='left')
    print(f'lenght of merged: {len(merged_chunk)}')

    

    # Filter rows where sales_date is greater than evaluation_date
    # condition = merged_chunk['salesInfo_deedIssueDate'] > merged_chunk['lastChange']
    # merged_chunk = merged_chunk[condition]

    # display(merged_chunk.info())

    # # Append the filtered chunk to the merged data
    # merged_data = pd.concat([merged_data, merged_chunk])
    merged_data_list.append(merged_chunk)



-----> chunk nr.1
chunk lenght: 1000000
lenght of merged: 5361500
-----> chunk nr.2
chunk lenght: 1000000
lenght of merged: 5416134
-----> chunk nr.3
chunk lenght: 1000000
lenght of merged: 5349221
-----> chunk nr.4
chunk lenght: 1000000
lenght of merged: 5339445
-----> chunk nr.5
chunk lenght: 1000000
lenght of merged: 5241262
-----> chunk nr.6
chunk lenght: 1000000
lenght of merged: 5259839
-----> chunk nr.7
chunk lenght: 1000000
lenght of merged: 5153455
-----> chunk nr.8
chunk lenght: 1000000
lenght of merged: 5094539
-----> chunk nr.9
chunk lenght: 1000000
lenght of merged: 5011781
-----> chunk nr.10
chunk lenght: 1000000
lenght of merged: 4914200
-----> chunk nr.11
chunk lenght: 1000000
lenght of merged: 4861190
-----> chunk nr.12
chunk lenght: 1000000
lenght of merged: 4770107
-----> chunk nr.13
chunk lenght: 1000000
lenght of merged: 4949059
-----> chunk nr.14
chunk lenght: 1000000
lenght of merged: 4886224
-----> chunk nr.15
chunk lenght: 1000000
lenght of merged: 4884168
----

In [37]:
# --- concatenate the list of results 
result_df = pd.concat(merged_data_list, ignore_index=True)
del merged_data_list

In [38]:
result_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80803686 entries, 0 to 80803685
Data columns (total 20 columns):
 #   Column                           Non-Null Count     Dtype  
---  ------                           --------------     -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID  80803686 non-null  object 
 1   salesInfo_isSalesValid           80803686 non-null  bool   
 2   salesInfo_handoverCode           80803686 non-null  float64
 3   salesInfo_handoverName           80803686 non-null  object 
 4   salesInfo_deedIssueDate          80803686 non-null  object 
 5   salesInfo_price                  80803686 non-null  float64
 6   salesInfo_recalculationDate      80803686 non-null  object 
 7   salesInfo_rebuildYear            80803686 non-null  float64
 8   unique_sales_ID                  80803686 non-null  int64  
 9   salesInfo_year_of_sale           80803686 non-null  int64  
 10  evaluationYear                   41427174 non-null  float64
 11  lastChange                       41

In [39]:
# Filter rows where sales_date is greater than evaluation_date
condition = result_df['salesInfo_deedIssueDate'] >= result_df['lastChange']
result_df = result_df[condition]

In [55]:
len(result_df)

13428143

In [61]:
a = []

a.extend([1,2])
a

[1, 2]

#### ----- get Unique rows that dont have missing evaluation

In [54]:
# Find duplicated IDs with empty propertyValue and landValue
duplicated_ids_empty_validSale = result_df[result_df.duplicated(subset='RowID_MAIN_boliga_ROW_ID_unitID', keep=False) & result_df['propertyValue'].isna() & result_df['landValue'].isna()]['RowID_MAIN_boliga_ROW_ID_unitID'].unique()
duplicated_ids_empty_validSale

array([], dtype=object)

In [33]:

# Remove rows with duplicated IDs and empty validSale
df_result_clean = result_df[~((result_df['unique_sales_ID'].isin(duplicated_ids_empty_validSale)) & (result_df['propertyValue'].isna()) & (result_df['landValue'].isna()) )]

df_result_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1524698 entries, 2 to 42020213
Data columns (total 20 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID  1524698 non-null  object 
 1   salesInfo_isSalesValid           1524698 non-null  bool   
 2   salesInfo_handoverCode           1524698 non-null  float64
 3   salesInfo_handoverName           1524698 non-null  object 
 4   salesInfo_deedIssueDate          1524698 non-null  object 
 5   salesInfo_price                  1524698 non-null  float64
 6   salesInfo_recalculationDate      1524698 non-null  object 
 7   salesInfo_rebuildYear            1524698 non-null  float64
 8   unique_sales_ID                  1524698 non-null  int64  
 9   salesInfo_year_of_sale           1524698 non-null  int64  
 10  evaluationYear                   1524698 non-null  float64
 11  lastChange                       1524698 non-null  obj

In [None]:
# ----Now get the unique 

In [44]:
df_result_clean[df_result_clean['propertyValue'].isna()]

Unnamed: 0,RowID_MAIN_boliga_ROW_ID_unitID,salesInfo_isSalesValid,salesInfo_handoverCode,salesInfo_handoverName,salesInfo_deedIssueDate,salesInfo_price,salesInfo_recalculationDate,salesInfo_rebuildYear,unique_sales_ID,salesInfo_year_of_sale,evaluationYear,lastChange,propertyValue,landValue,deductionSum,usage,residentialUnits,propertyValueArea,rebuildYear,areaSize


In [32]:
df_result_clean_simple = result_df.drop_duplicates(subset=['unique_sales_ID'], keep='first')
df_result_clean_simple.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2623800 entries, 0 to 2625466
Data columns (total 20 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID  object 
 1   salesInfo_isSalesValid           bool   
 2   salesInfo_handoverCode           float64
 3   salesInfo_handoverName           object 
 4   salesInfo_deedIssueDate          object 
 5   salesInfo_price                  float64
 6   salesInfo_recalculationDate      object 
 7   salesInfo_rebuildYear            float64
 8   unique_sales_ID                  int64  
 9   salesInfo_year_of_sale           object 
 10  evaluationYear                   float64
 11  lastChange                       object 
 12  propertyValue                    float64
 13  landValue                        float64
 14  deductionSum                     float64
 15  usage                            object 
 16  residentialUnits                 float64
 17  propertyValue

In [39]:
df_result_clean_simple[df_result_clean_simple['unique_sales_ID']==2]

Unnamed: 0,RowID_MAIN_boliga_ROW_ID_unitID,salesInfo_isSalesValid,salesInfo_handoverCode,salesInfo_handoverName,salesInfo_deedIssueDate,salesInfo_price,salesInfo_recalculationDate,salesInfo_rebuildYear,unique_sales_ID,salesInfo_year_of_sale,evaluationYear,lastChange,propertyValue,landValue,deductionSum,usage,residentialUnits,propertyValueArea,rebuildYear,areaSize
1,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2023-01-18,3050000.0,2023-11-08T00:00:00,2008.0,2,2023,,,,,,,,,,


In [36]:
print(len(df_result_clean_simple[df_result_clean_simple['propertyValue'].isna()]))


2562059


In [38]:
df_result_clean_simple[df_result_clean_simple['propertyValue'].isna()]

Unnamed: 0,RowID_MAIN_boliga_ROW_ID_unitID,salesInfo_isSalesValid,salesInfo_handoverCode,salesInfo_handoverName,salesInfo_deedIssueDate,salesInfo_price,salesInfo_recalculationDate,salesInfo_rebuildYear,unique_sales_ID,salesInfo_year_of_sale,evaluationYear,lastChange,propertyValue,landValue,deductionSum,usage,residentialUnits,propertyValueArea,rebuildYear,areaSize
1,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2023-01-18,3050000.0,2023-11-08T00:00:00,2008.0,2,2023,,,,,,,,,,
4,b1a52c68-cc3c-4ee0-b794-d9e638cc26e4,True,1.0,Alm. frit salg,2023-11-07,3900000.0,2023-11-07T00:00:00,1976.0,5,2023,,,,,,,,,,
5,f45b8fc3-af55-4907-b05e-bbc3f98c5935,True,1.0,Alm. frit salg,2023-11-07,1160000.0,2023-11-07T00:00:00,1898.0,6,2023,,,,,,,,,,
7,0d3618c0-35a1-49bc-b2cb-9228ef5065f9,True,1.0,Alm. frit salg,1899-12-30,1115000.0,2000-04-28T00:00:00,1917.0,8,1899,,,,,,,,,,
8,0d3618c0-35a1-49bc-b2cb-9228ef5065f9,True,1.0,Alm. frit salg,1899-12-30,506900.0,1997-01-29T00:00:00,1917.0,9,1899,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2625462,845ce2df-d0ca-45cf-b6ca-df6ca620a685,True,2.0,Familieoverdragelse,2009-04-20,797961.0,2009-04-20T00:00:00,1953.0,2623796,2009,,,,,,,,,,
2625463,81ce9112-1089-4179-a38e-e89dd148faac,True,1.0,Alm. frit salg,1899-12-30,6500000.0,2001-06-20T00:00:00,1960.0,2623797,1899,,,,,,,,,,
2625464,81ce9112-1089-4179-a38e-e89dd148faac,True,1.0,Alm. frit salg,2006-01-02,7600000.0,2006-01-02T00:00:00,1960.0,2623798,2006,,,,,,,,,,
2625465,ac22bf6a-bb13-4092-be79-ad8951e0b5c3,True,1.0,Alm. frit salg,2006-05-24,2649900.0,2006-01-02T00:00:00,1963.0,2623799,2006,,,,,,,,,,


In [28]:
result_df[result_df['unique_sales_ID']==1]

Unnamed: 0,RowID_MAIN_boliga_ROW_ID_unitID,salesInfo_isSalesValid,salesInfo_handoverCode,salesInfo_handoverName,salesInfo_deedIssueDate,salesInfo_price,salesInfo_recalculationDate,salesInfo_rebuildYear,unique_sales_ID,salesInfo_year_of_sale,evaluationYear,lastChange,propertyValue,landValue,deductionSum,usage,residentialUnits,propertyValueArea,rebuildYear,areaSize
0,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,2007.0,2007-10-01,1089600.0,1089600.0,330600.0,"Ubebygget areal (Ikke landbrugsareal), forskel...",0.0,1089600.0,2008.0,1124.0
2625467,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,,,,,,,,,,
5251298,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,,,,,,,,,,
7877082,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,,,,,,,,,,
10503627,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,,,,,,,,,,
13129901,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,,,,,,,,,,
15756299,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,,,,,,,,,,
18383375,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,,,,,,,,,,
21010565,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,,,,,,,,,,
23637775,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,,,,,,,,,,


In [27]:
result_df[result_df.duplicated(subset=['unique_sales_ID'],keep=False)]


Unnamed: 0,RowID_MAIN_boliga_ROW_ID_unitID,salesInfo_isSalesValid,salesInfo_handoverCode,salesInfo_handoverName,salesInfo_deedIssueDate,salesInfo_price,salesInfo_recalculationDate,salesInfo_rebuildYear,unique_sales_ID,salesInfo_year_of_sale,evaluationYear,lastChange,propertyValue,landValue,deductionSum,usage,residentialUnits,propertyValueArea,rebuildYear,areaSize
0,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-12,14100000.0,2006-11-17T00:00:00,2008.0,1,2007,2007.0,2007-10-01,1089600.0,1089600.0,330600.0,"Ubebygget areal (Ikke landbrugsareal), forskel...",0.0,1089600.0,2008.0,1124.0
1,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2023-01-18,3050000.0,2023-11-08T00:00:00,2008.0,2,2023,,,,,,,,,,
2,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2017-12-11,2910000.0,2017-08-30T00:00:00,2008.0,3,2017,2017.0,2017-10-01,2250000.0,375000.0,161300.0,Beboelsesejendom,1.0,2250000.0,2008.0,562.0
3,9133416f-191b-496e-88bb-62b3a46a370a,True,1.0,Alm. frit salg,2007-09-27,2600000.0,2007-06-14T00:00:00,2008.0,4,2007,2007.0,2007-10-01,1089600.0,1089600.0,330600.0,"Ubebygget areal (Ikke landbrugsareal), forskel...",0.0,1089600.0,2008.0,1124.0
4,b1a52c68-cc3c-4ee0-b794-d9e638cc26e4,True,1.0,Alm. frit salg,2023-11-07,3900000.0,2023-11-07T00:00:00,1976.0,5,2023,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42030517,845ce2df-d0ca-45cf-b6ca-df6ca620a685,True,2.0,Familieoverdragelse,2009-04-20,797961.0,2009-04-20T00:00:00,1953.0,2623796,2009,2009.0,2009-10-01,880000.0,232500.0,0.0,Beboelsesejendom,1.0,880000.0,1953.0,750.0
42030518,81ce9112-1089-4179-a38e-e89dd148faac,True,1.0,Alm. frit salg,1899-12-30,6500000.0,2001-06-20T00:00:00,1960.0,2623797,1899,,,,,,,,,,
42030519,81ce9112-1089-4179-a38e-e89dd148faac,True,1.0,Alm. frit salg,2006-01-02,7600000.0,2006-01-02T00:00:00,1960.0,2623798,2006,2006.0,2006-10-01,7100000.0,538800.0,0.0,Fabrik og lager.,1.0,7100000.0,1960.0,6735.0
42030520,ac22bf6a-bb13-4092-be79-ad8951e0b5c3,True,1.0,Alm. frit salg,2006-05-24,2649900.0,2006-01-02T00:00:00,1963.0,2623799,2006,2006.0,2006-10-01,1250000.0,349200.0,0.0,Beboelsesejendom,1.0,1250000.0,1963.0,646.0


In [None]:
len(data_salesInfo)

In [None]:
len(merged_data_nodobble)

In [None]:
merged_data_nodobble.info()

In [None]:
merged_data_nodobble

In [None]:
merged_data.info()

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

# Example DataFrame A
data_a = {
    'ID': [1, 2, 3, 1, 4],
    'Date_A': [datetime(2022, 1, 1), datetime(2022, 2, 1), datetime(2022, 3, 1), datetime(2022, 4, 1), datetime(2022, 5, 1)],
    'Value_A': [10, 20, 30, 40, 50]
}

df_a = pd.DataFrame(data_a)

# Example DataFrame B
data_b = {
    'ID': [1, 2, 3, 1, 4] * 4,  # Repeated IDs
    'Date_B': [datetime(2021, 12, 1), datetime(2022, 1, 1), datetime(2022, 2, 1), datetime(2022, 3, 1), datetime(2022, 4, 1)] * 4,  # Repeated Dates
    'Value_B': np.arange(1, 21)
}

df_b = pd.DataFrame(data_b)

# Convert Date columns to date format
df_a['Date_A'] = df_a['Date_A'].dt.date
df_b['Date_B'] = df_b['Date_B'].dt.date

# Filter DataFrame B based on the condition
filtered_df_b = df_b[df_b['Date_B'] > df_a['Date_A']]

# Merge DataFrames based on ID
merged_df = pd.merge(df_a, filtered_df_b, how='left', on='ID')

# Display the result
print("DataFrame A:")
print(df_a)
print("\nDataFrame B:")
print(df_b)
print("\nMerged DataFrame:")
print(merged_df)


# --- OLD SHIT 

In [None]:
path_evaluation = r'D:\Thesis\Properties\Denmark\RE_due_scraping_properties\Boliga_dk\Creating_main_dataset_for_sales_data\Data_split\12_evalutationInfos\Ready\Boliga_propertySales_evaluationInfo_Ready.csv'

# Define chunk size based on your available memory for DataFrame B
chunk_size = 10000

# Read DataFrame evaluation as chunks
evaluation_chunks = pd.read_csv(path_evaluation, chunksize=chunk_size)

row_counter = 0
for index, row in data_salesInfo.iterrows():
    row_counter+=1
    print(f'----- row nr. {row_counter} out of 2.623.800')

    # -- get values 
    sales_ID = row['RowID_MAIN_boliga_ROW_ID_unitID']
    sales_date = row['salesInfo_deedIssueDate']

    # trigger to go next row if conditions are met
    new_sales_line = False


    for evaluation_chunk in evaluation_chunks:
        
        # convert the date to day, month and year 
        evaluation_chunk['lastChange'] = pd.to_datetime(evaluation_chunk['lastChange'], format="%Y-%m-%dT%H:%M:%S").dt.date

        # short chunk by "lastChange"
        evaluation_chunk = evaluation_chunk.sort_values(by='lastChange', ascending=False)


        for index_evaluation, row_evaluation in evaluation_chunk.iterrows():
            # -- get values 
            evaluation_ID = row_evaluation['RowID_MAIN_boliga_ROW_ID_unitID']
            evaluation_date = row_evaluation['lastChange']

            if sales_ID == evaluation_ID:
                # print(type(sales_date))
                # print(type(evaluation_date))
                if sales_date>evaluation_date:
                    data_salesInfo.at[index,'evaluationInfo_evaluationYear'] = row_evaluation['evaluationYear']
                    data_salesInfo.at[index,'evaluationInfo_lastChange'] = row_evaluation['lastChange']
                    data_salesInfo.at[index,'evaluationInfo_propertyValue'] = row_evaluation['propertyValue']
                    data_salesInfo.at[index,'evaluationInfo_landValue'] = row_evaluation['landValue']
                    data_salesInfo.at[index,'evaluationInfo_deductionSum'] = row_evaluation['deductionSum']
                    data_salesInfo.at[index,'evaluationInfo_usage'] = row_evaluation['usage']
                    data_salesInfo.at[index,'evaluationInfo_residentialUnits'] = row_evaluation['residentialUnits']
                    data_salesInfo.at[index,'evaluationInfo_propertyValueArea'] = row_evaluation['propertyValueArea']
                    data_salesInfo.at[index,'evaluationInfo_rebuildYear'] = row_evaluation['rebuildYear']
                    data_salesInfo.at[index,'evaluationInfo_areaSize'] = row_evaluation['areaSize']
                    new_sales_line = True
                    break
        
        # if we got the values the move to the next sales row!
        if new_sales_line:
            break

In [None]:
data_salesInfo['RowID_MAIN_boliga_ROW_ID_unitID']

In [None]:
data_salesInfo['salesInfo_deedIssueDate'] = pd.to_datetime(data_salesInfo['salesInfo_deedIssueDate'], format="%Y-%m-%dT%H:%M:%S").dt.date
data_salesInfo['salesInfo_deedIssueDate'][0]

In [None]:
path_evaluation = r'D:\Thesis\Properties\Denmark\RE_due_scraping_properties\Boliga_dk\Creating_main_dataset_for_sales_data\Data_split\12_evalutationInfos\Ready\Boliga_propertySales_evaluationInfo_Ready.csv'

# Define chunk size based on your available memory for DataFrame B
chunk_size = 10000

# Define the columns to be used for merging
merge_columns = ["RowID_MAIN_boliga_ROW_ID_unitID"]

# Read DataFrame B in chunks
date_evaluation_chunks = pd.read_csv(path_evaluation, chunksize=chunk_size)

# Iterate through DataFrame B chunks
for date_evaluation in date_evaluation_chunks:
    
    date_evaluation["lastChange"] = pd.to_datetime(date_evaluation["lastChange"], format="%Y-%m-%dT%H:%M:%S").dt.date

    # Merge based on ID column
    merged_chunk = pd.merge(data_salesInfo, date_evaluation, on=merge_columns, how="left", suffixes=('_a', '_b'))

    # Filter based on the date condition
    filtered_chunk = merged_chunk[(merged_chunk["salesInfo_deedIssueDate"] >= merged_chunk["lastChange"]) | merged_chunk["lastChange"].isnull()]

    # Drop unnecessary columns from DataFrame A
    drop_columns = [col for col in filtered_chunk.columns if col.endswith('_b')]
    filtered_chunk = filtered_chunk.drop(columns=drop_columns)

    # Update DataFrame A with the new values
    data_salesInfo.update(filtered_chunk)

In [None]:
filtered_chunk.info()

In [19]:
path_evaluation = r'D:\Thesis\Properties\Denmark\RE_due_scraping_properties\Boliga_dk\Creating_main_dataset_for_sales_data\Data_split\12_evalutationInfos\Ready\Boliga_propertySales_evaluationInfo_Ready.csv'
# Specify the number of rows you want to read (in this case, 100)
num_rows_to_read = 100

# Replace "your_file.csv" with the actual file path of your CSV file
date_evaluation = pd.read_csv(path_evaluation, nrows=num_rows_to_read)

display(date_evaluation.info())


display(date_evaluation[''])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   RowID_MAIN_boliga_ROW_ID_unitID  100 non-null    object 
 1   evaluationYear                   100 non-null    float64
 2   lastChange                       100 non-null    object 
 3   propertyValue                    100 non-null    float64
 4   landValue                        100 non-null    float64
 5   deductionSum                     100 non-null    float64
 6   usage                            100 non-null    object 
 7   residentialUnits                 100 non-null    float64
 8   propertyValueArea                100 non-null    float64
 9   rebuildYear                      100 non-null    float64
 10  areaSize                         100 non-null    float64
dtypes: float64(8), object(3)
memory usage: 8.7+ KB


None

KeyError: ''

In [None]:
date_evaluation["lastChange"] = pd.to_datetime(date_evaluation["lastChange"], format="%Y-%m-%dT%H:%M:%S").dt.date
date_evaluation["lastChange"][0]

In [None]:
if date_evaluation["lastChange"][0] > data_salesInfo['salesInfo_deedIssueDate'][0]:
    print("bingo")