In [92]:
# Imports
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Compare number of duplicates in datasets scraped by Python and Go

In [93]:
# Load the two scrapes of the same day: one produced by the Python scraper,
# one produced by the Go scraper
df_python, df_go = map(pd.read_csv, ["data/17-04-23.csv", "data/17-04-23_go.csv"])

In [94]:
# Sanity check: both scrapes contain the same number of non-null ids
df_python["id"].count() == df_go["id"].count()

True

In [95]:
# Boolean masks over each scrape: True for every row whose id has already
# appeared earlier in the same frame (the first occurrence stays False)
p = df_python.duplicated(subset="id", keep="first")
g = df_go.duplicated(subset="id", keep="first")

In [105]:
# Python vs Go: number of duplicated rows produced by each scraper
total_records = df_go.id.count()
python_duplicates = int(p.sum())  # a True in the mask is one duplicated row
go_duplicates = int(g.sum())
print(f"Out of {total_records} records, scraping with Python resulted in {python_duplicates} duplicates, and only {go_duplicates} when using Go")

Out of 1968 records, scraping with Python resulted in 185 duplicates, and only 46 when using Go


### Minimize the number of duplicates by repeated scraping

In [97]:
# Load the three repeated scrapes (A, B, C) taken on the same day
df_a, df_b, df_c = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/20-04-23_A.csv",
        "data/20-04-23_B.csv",
        "data/20-04-23_C.csv",
    )
)

In [98]:
# Sanity check: all three scrapes contain the same number of non-null ids
count_a, count_b, count_c = (scrape["id"].count() for scrape in (df_a, df_b, df_c))
count_a == count_b and count_b == count_c

True

In [109]:
# Count duplicates
# One boolean mask per scrape: True for every row whose id already appeared
# earlier in that scrape (the first occurrence stays False)
a, b, c = (
    scrape.duplicated(subset="id", keep="first") for scrape in (df_a, df_b, df_c)
)
# Summing a boolean mask counts its True entries, i.e. the duplicated rows
dup_a, dup_b, dup_c = (int(mask.sum()) for mask in (a, b, c))
print(f"Duplicated records \nA: {dup_a} \nB: {dup_b} \nC: {dup_c}")

Duplicated records 
A: 410 
B: 141 
C: 20


In [100]:
# Stack the three scrapes row-wise into one frame with a fresh 0..n-1 index
abc = pd.concat(objs=(df_a, df_b, df_c), axis=0, ignore_index=True)

In [101]:
# Keep only the first row seen for every id in the stacked frame
repeated_id = abc.duplicated(subset=["id"], keep="first")
df = abc[~repeated_id]

In [107]:
# Missing records in the combined dataframe, relative to the size of a single scrape
original_count = df_a.id.count()
combined_count = df.id.count()
print(f"Missing records in combined dataframe: {original_count - combined_count}\nOriginal # of records: {original_count}\nNew number of records: {combined_count}")

Missing records in combined dataframe: 2
Original # of records: 2020
New number of records: 2018


In [103]:
# Results
# Every percentage below is expressed relative to the record count of an
# original scrape, so the four figures are directly comparable.
# NOTE(review): a duplicated row is treated as one missing record, as in the
# original analysis - confirm this matches how the scraper loses listings.
single_scrapes = [
    ("Worst case scenario - A", df_a, a),
    ("Middle scenario - B", df_b, b),
    ("Best case scenario - C", df_c, c),
]
for label, scrape, duplicate_mask in single_scrapes:
    # Summing the boolean mask counts the duplicated rows of this scrape
    pct_missing = duplicate_mask.sum() / scrape.id.count() * 100
    print(f"{label}: {pct_missing: .1f}% missing records")

# Combined dataset: records still missing after merging A, B and C.
# The denominator is the original scrape size (not the deduplicated size),
# matching the per-scrape figures above.
expected_records = df_a.id.count()
still_missing = expected_records - df.id.count()
print(f"Dataset obtained from all above scenarios: {still_missing / expected_records * 100: .1f}% missing records")

Worst case scenario - A:  20.3% missing records
Middle scenario - B:  7.0% missing records
Best case scenario - C:  1.0% missing records
Dataset obtained from all above scenarios:  0.1% missing records


In [104]:
# Save resulting dataset to .csv file
# The export is currently disabled; uncomment the line below to write the
# combined, de-duplicated dataframe `df` to disk without its index column.
# df.to_csv("data/20-04-23.csv", index=False)

### Results
By combining data from three scrapes, we obtained a dataset with only 0.1% of its records missing (2 out of 2020)!  
Repeated scraping was done using Go: https://github.com/szymongalecki/boligzonen_scraper

Scrape times:  
- Python: 15–30 minutes  
- Go: ~4 minutes