In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from unidecode import unidecode

In [2]:
# Load the raw ads table. Despite its name, `data_dir` holds the path of the
# CSV file itself, not of a directory.
data_dir = Path("../data/real_estate/ads.csv")
ds_original = pd.read_csv(data_dir)

In [3]:
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
  """Return an ASCII-only copy of ``df`` without the ``id`` and ``price`` columns.

  Every remaining cell is passed through ``unidecode`` when it is a string;
  any other value (numbers, ``NaN``, ``None``, ...) is replaced by ``pd.NA``.
  The caller's frame is not modified because ``drop`` returns a new frame.

  Args:
      df: Raw frame; must contain the ``id`` and ``price`` columns.

  Returns:
      A new frame whose cells are either transliterated strings or ``pd.NA``.

  Raises:
      KeyError: If ``id`` or ``price`` is missing from ``df``.
  """
  def _to_ascii(value):
    # isinstance (instead of an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which were previously turned into NA.
    return unidecode(value) if isinstance(value, str) else pd.NA

  cleaned = df.drop(columns=["id", "price"])
  for column in cleaned.columns:
    cleaned[column] = cleaned[column].apply(_to_ascii)
  return cleaned

# Cleaned working copy; clean_dataset builds a new frame, so `ds_original`
# keeps the raw data.
ds = clean_dataset(ds_original)

In [4]:
# (rows, columns) of the cleaned frame.
ds.shape

(1875, 5)

In [7]:
# Number of rows in the cleaned frame.
ds_len = ds.shape[0]
# Presumably the share of rows meant for the first partition (0.5 = even
# split) — NOTE(review): confirm which later cells read it.
percent = 0.5

In [8]:
# Partition sizes: the first part gets `percent` of the rows (rounded down),
# the second part gets whatever is left so the two always add up to ds_len.
# Uses the `percent` constant instead of repeating the literal 0.5.
ds_1_len = int(ds_len * percent)
ds_2_len = ds_len - ds_1_len
ds_1_len, ds_2_len

(937, 938)

In [10]:
# First partition: the leading `ds_1_len` descriptions, selected by position.
ds_1_description = ds["description"].iloc[:ds_1_len]
ds_1_description

0      PARIS 17eme. AVENUE NIEL *** Video disponible ...
1      Appartement Paris 3 piece(s) 53 m2. Stephane P...
2      Vente Appartement 2 pieces de 31m2 - 75018 Par...
3      A vendre, en exclusivite, dans le 11e arrondis...
4      Gambetta 2 piece(s) 40 m2. Rue d'Annam, a quel...
                             ...                        
932    Appartement Paris 3 piece(s) 50 m2 + 24m2 d'es...
933    Dpt Paris (75), a vendre PARIS 18EME ARRONDISS...
934    Paris Centre - MARAIS - 3 PIECES dans une copr...
935    Appartement Paris 2 piece(s) 35.10 m2. 75018 P...
936    Appartement 4 pieces (115 m2) en vente dans le...
Name: description, Length: 937, dtype: object

In [11]:
# Second partition: every description after the first `ds_1_len`, selected by position.
ds_2_description = ds["description"].iloc[ds_1_len:]
ds_2_description

937     PARIS XIV - ALESIA - APPARTEMENT TRES LUMINEUX...
938     Paris XIVeme Triangle d'OR: 3 pieces - rue du ...
939     Ternes - 3 pieces - 68,8 m2. Exclusivite.<br>S...
940     PARIS 75010, proche de l'hopital Saint-Louis, ...
941     Loft avec atelier dans une ancienne serrurerie...
                              ...                        
1870    Appartement familial - Nation. Le Groupe H&amp...
1871    Village de Passy - 5 pieces - 195,51 m2. Villa...
1872    Appartement 2 pieces de 46m2 | Rue Clisson | P...
1873    Duplex 3 pieces avec terrasse - Pernety XVIeme...
1874    PARIS 18EME -AVE CLICHY-Ideal 1er achat. PARIS...
Name: description, Length: 938, dtype: object

In [14]:
from rapidfuzz import fuzz
import fuzzy
from Levenshtein import distance as levenshtein_distance

class IntelektualnyiModul:
  """Scores how alike two strings are and turns the score into a yes/no label.

  Three metrics are available:
    * ``token_sort_ratio``  - rapidfuzz token-sort ratio rescaled by 1/100;
    * ``soundex_similarity`` - 1 if both Soundex codes are equal, else 0;
    * ``damerau_levenshtein_distance`` - edit distance (a distance, not a
      similarity; its default weight is 0.0 so it does not affect the score).

  The similarity metrics share one scale so that their weighted sum can be
  compared with ``threshold``. Previously the Soundex term was 0/100 while
  the token term was divided by 100, which made the token term irrelevant
  to the decision.
  """

  # Default blend, defined once so __init__ and similarity_weighted agree.
  TOKEN_SORT_WEIGHT = 0.5
  SOUNDEX_WEIGHT = 0.5
  LEVENSHTEIN_WEIGHT = 0.0

  def __init__(self, soundex: int = 4, threshold: float = 0.75):
    """Build the scorer.

    Args:
        soundex: Forwarded to ``fuzzy.Soundex`` as its only argument
          (presumably the length of the generated code - confirm in fuzzy docs).
        threshold: Cut-off used by ``hard_decision``.
    """
    self.soundex = fuzzy.Soundex(soundex)
    # (metric, weight) pairs consumed by similarity_weighted_V2.
    self.weights = [
      (self.token_sort_ratio, self.TOKEN_SORT_WEIGHT),
      (self.soundex_similarity, self.SOUNDEX_WEIGHT),
      (self.damerau_levenshtein_distance, self.LEVENSHTEIN_WEIGHT),
    ]

    self.threshold = threshold
    # Index 0 = accepted, index 1 = rejected (see hard_decision).
    self.decisions = [
      "Yess🎀", "Nooo🚬",
    ]

  def soundex_similarity(self, str1, str2) -> int:
    """Return 1 when both strings have the same Soundex code, otherwise 0."""
    return int(self.soundex(str1) == self.soundex(str2))

  def similarity_weighted(self, str1, str2, jw_: float = None, soundex_: int = None, levenshtein_: int = None) -> float:
    """Blend the three metrics with the default weights.

    Any metric value passed in is used as given; a value left as ``None`` is
    computed from ``str1``/``str2``.

    Args:
        str1, str2: Strings to compare.
        jw_: Pre-computed token-sort ratio (the name is historical).
        soundex_: Pre-computed Soundex similarity.
        levenshtein_: Pre-computed edit distance.

    Returns:
        Weighted sum rounded to two decimals.
    """
    if jw_ is None:
      jw_ = self.token_sort_ratio(str1, str2)
    if soundex_ is None:
      soundex_ = self.soundex_similarity(str1, str2)
    if levenshtein_ is None:
      levenshtein_ = self.damerau_levenshtein_distance(str1, str2)

    similarity = (
      jw_ * self.TOKEN_SORT_WEIGHT +
      soundex_ * self.SOUNDEX_WEIGHT +
      levenshtein_ * self.LEVENSHTEIN_WEIGHT
    )
    return round(similarity, 2)

  def similarity_weighted_V2(self, str1, str2, weights: list = None) -> float:
    """Blend arbitrary metrics.

    Args:
        str1, str2: Strings to compare.
        weights: List of ``(callable(str1, str2), weight)`` pairs; defaults to
          ``self.weights``.

    Returns:
        Sum of ``metric(str1, str2) * weight`` rounded to two decimals.
    """
    if weights is None:
      weights = self.weights
    similarity = sum(metric(str1, str2) * weight for metric, weight in weights)
    return round(similarity, 2)

  def hard_decision(self, similarity, strong: bool = False) -> str:
    """Map a similarity score to one of the two labels in ``self.decisions``.

    With ``strong=False`` a score equal to the threshold is rejected
    (``<=``); with ``strong=True`` only scores strictly below the threshold
    are rejected (``<``).
    """
    if strong:
      rejected = similarity < self.threshold
    else:
      rejected = similarity <= self.threshold
    # Explicit branch instead of indexing the list with a boolean.
    return self.decisions[1] if rejected else self.decisions[0]

  @staticmethod
  def damerau_levenshtein_distance(str1, str2) -> int:
    """Return the edit distance computed by ``Levenshtein.distance``.

    NOTE(review): per the Levenshtein package docs this is the plain
    Levenshtein distance (no transposition operation); the method name is
    kept for compatibility with existing callers.
    """
    return levenshtein_distance(str1, str2)

  @staticmethod
  def token_sort_ratio(str1, str2) -> float:
    """Return rapidfuzz's ``fuzz.token_sort_ratio`` divided by 100."""
    return fuzz.token_sort_ratio(str1, str2) / 100

  def compare(self, str1, str2) -> list:
    """Run every metric on one pair of strings.

    Returns:
        list, each value rounded to two decimals, in this order:
          soundex_sim,
          damerau-levenshtein,
          token_sort_ratio,
          similarity_weighted_V2
    """
    res_list = [
      self.soundex_similarity(str1, str2),
      self.damerau_levenshtein_distance(str1, str2),
      self.token_sort_ratio(str1, str2),
      self.similarity_weighted_V2(str1, str2),
    ]
    return [round(res, 2) for res in res_list]

  def compare_row(self, row: pd.Series) -> list:
    """Unpack a two-element row into ``(str1, str2)`` and call ``compare``."""
    str1, str2 = row
    return self.compare(str1, str2)
  
# Shared scorer built with the default settings; compare_column and
# compare_different_columns read this global.
modul = IntelektualnyiModul()

In [15]:
def compare_column(ds: pd.Series) -> dict:
  """Compare every unordered pair of entries of ``ds`` using the global ``modul``.

  Entries are addressed by position, not by index label. Progress is printed
  after every 15th outer row.

  Args:
      ds: Series of strings to compare with each other.

  Returns:
      dict mapping ``(i, j)`` with ``i < j`` to the list returned by
      ``modul.compare`` for the entries at those positions.
  """
  values = ds.tolist()  # convert once instead of an .iloc lookup per pair
  ds_len = len(values)
  result_dict = {}
  for i, str1 in enumerate(values):
    for j in range(i + 1, ds_len):
      result_dict[(i, j)] = modul.compare(str1, values[j])
    if i % 15 == 0:
      print(f'row: {i}/{ds_len} finished comparing | str1_len: {len(str1)}')
  return result_dict

In [19]:
def compare_different_columns(s1: pd.Series, s2: pd.Series) -> dict:
  """Compare every entry of ``s1`` with every entry of ``s2`` using the global ``modul``.

  Entries are addressed by position, not by index label. Progress is printed
  after every 15th row of ``s1``; the total shown is the length of ``s1``
  (it used to read an unrelated notebook-global ``ds_len``).

  Args:
      s1: Series of strings forming the left side of each pair.
      s2: Series of strings forming the right side of each pair.

  Returns:
      dict mapping ``(i, j)`` (position in ``s1``, position in ``s2``) to the
      list returned by ``modul.compare`` for that pair.
  """
  left = s1.tolist()   # convert once instead of an .iloc lookup per pair
  right = s2.tolist()
  s1_len = len(left)
  result_dict = {}
  for i, str1 in enumerate(left):
    for j, str2 in enumerate(right):
      result_dict[(i, j)] = modul.compare(str1, str2)
    if i % 15 == 0:
      print(f'row: {i}/{s1_len} finished comparing | str1_len: {len(str1)}')
  return result_dict

In [None]:
# Pairwise comparison inside the first partition — currently disabled (the
# call is commented out, so the result is None).
# NOTE(review): compare_column returns a dict, which has no `.to_csv`; convert
# it first (e.g. pd.DataFrame.from_dict(results, orient="index")) before saving.
ds_1_1_description_results = None # compare_column(ds_1_description)

save_path = "comparison_1_1.csv"
# ds_1_1_description_results.to_csv(save_path)

In [None]:
# Pairwise comparison inside the second partition — currently disabled (the
# call is commented out, so the result is None).
# NOTE(review): compare_column returns a dict, which has no `.to_csv`; convert
# it first (e.g. pd.DataFrame.from_dict(results, orient="index")) before saving.
ds_2_2_description_results = None # compare_column(ds_2_description)

save_path = "comparison_2_2.csv"
# ds_2_2_description_results.to_csv(save_path)

In [None]:
# Cross comparison of the first partition against the second — currently
# disabled (the call is commented out, so the result is None).
# NOTE(review): compare_different_columns returns a dict, which has no
# `.to_csv`; convert it first (e.g. pd.DataFrame.from_dict(results, orient="index")).
ds_1_2_description_results = None # compare_different_columns(ds_1_description, ds_2_description)

save_path = "comparison_1_2.csv"
# ds_1_2_description_results.to_csv(save_path)