In [1]:
import pandas as pd
from src.name_scoring import calculate_name_similarity
from src.date_scoring import calculate_date_similarity

In [99]:
# Load the candidate person pairs. The column labels are supplied here through
# ``names`` rather than being read from the file.
column_names = ['name_1', 'birth_date_1', 'name_2', 'birth_date_2']
df = pd.read_csv('data/docs_similar_people.csv', names=column_names)
# For a quick dry run on a small sample, slice the frame first, e.g. ``df.loc[:100]``.

In [100]:
# Preview the first five rows of the loaded pairs.
df.head()

Unnamed: 0,name_1,birth_date_1,name_2,birth_date_2
0,ვარდოსანიძე-პაშკევიჩ ანა,1995-12-12,ვარდოსანიძე ანა,1995-12-12
1,ვარდოსანიძე-პაშკევიჩ ანა,1995-12-12,ვარდოსანიძე ანა,1995-12-12
2,ედიბერიძე მარიამ,1994-12-19,ედიბერიძე მარიკა,1994-12-19
3,ჟორჟოლიანი ცირა,1991-12-19,ლაბაძე ცირა,1991-12-19
4,აბესაძე ნინო,2000-07-08,კეკუტია ნინო,2000-07-08


In [101]:
# Summarise row count, column dtypes and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2996284 entries, 0 to 2996283
Data columns (total 4 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   name_1        object
 1   birth_date_1  object
 2   name_2        object
 3   birth_date_2  object
dtypes: object(4)
memory usage: 91.4+ MB


In [102]:
# Collapse exact repeats of the same (name, birth date) pair so that each
# distinct pair is kept once (first occurrence wins, original index retained).
pair_key = ['name_1', 'birth_date_1', 'name_2', 'birth_date_2']
df = df.drop_duplicates(subset=pair_key)

In [103]:
# Summarise the frame again to see how many rows remain after duplicate removal.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699441 entries, 0 to 2996281
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name_1        699441 non-null  object
 1   birth_date_1  699441 non-null  object
 2   name_2        699441 non-null  object
 3   birth_date_2  699441 non-null  object
dtypes: object(4)
memory usage: 26.7+ MB


In [104]:
def calculate_scores(f):
    """Score one candidate pair of person records.

    Args:
        f: Row-like mapping that provides the keys ``name_1``, ``name_2``,
            ``birth_date_1`` and ``birth_date_2``.

    Returns:
        list: ``[name_score, year_score, month_score, day_score]`` — the value
        returned by ``calculate_name_similarity`` followed by the three values
        unpacked from ``calculate_date_similarity``.
    """
    name_similarity = calculate_name_similarity(f['name_1'], f['name_2'])
    date_similarity = calculate_date_similarity(f['birth_date_1'], f['birth_date_2'])
    # Unpack explicitly so a result that is not exactly three values still fails loudly.
    year_part, month_part, day_part = date_similarity
    return [name_similarity, year_part, month_part, day_part]

In [105]:
# Score every pair row by row. ``result_type='expand'`` makes ``DataFrame.apply``
# spread each returned 4-item list straight into columns 0..3, which avoids the
# much slower second row-wise pass of ``.apply(pd.Series)`` over ~700k rows.
scores = df.apply(calculate_scores, axis='columns', result_type='expand')
scores

Unnamed: 0,0,1,2,3
0,0.800000,1.0,1.000000,1.000000
2,0.966667,1.0,1.000000,1.000000
3,0.704365,1.0,1.000000,1.000000
4,0.755952,1.0,1.000000,1.000000
6,0.794312,1.0,1.000000,1.000000
...,...,...,...,...
2996251,0.708995,1.0,0.666667,0.666667
2996260,0.940741,1.0,1.000000,1.000000
2996264,0.754960,1.0,1.000000,1.000000
2996265,0.782738,1.0,1.000000,1.000000


In [106]:
# Give the positional score columns (0..3) descriptive names.
score_labels = {0: 'name_score', 1: 'year_score', 2: 'month_score', 3: 'day_score'}
scores = scores.rename(columns=score_labels)

In [107]:
# Attach the score columns to the pair data; rows are aligned on the index.
new_df = pd.concat((df, scores), axis=1)

In [108]:
# Unweighted mean of the four component scores. Plain column arithmetic is
# used instead of a per-row lambda: same operand order and NaN propagation,
# but evaluated in one vectorized pass.
new_df['avg_score'] = (
    new_df['name_score'] + new_df['year_score'] + new_df['month_score'] + new_df['day_score']
) / 4

In [109]:
# Birth-date score: mean of the year, month and day component scores,
# computed with vectorized column arithmetic rather than a per-row lambda.
new_df['bd_score'] = (
    new_df['year_score'] + new_df['month_score'] + new_df['day_score']
) / 3

In [110]:
# Combined score: name similarity and birth-date similarity weighted equally
# (unlike ``avg_score``, where each of the four components counts one quarter).
# Vectorized column arithmetic replaces the per-row lambda.
new_df['score'] = (new_df['name_score'] + new_df['bd_score']) / 2

In [112]:
# Keep one row (the first) per distinct combination of the four component scores.
# NOTE(review): different person pairs that happen to score identically are
# discarded here — presumably to get a compact sample for inspection; confirm.
score_columns = ['name_score', 'year_score', 'month_score', 'day_score']
new_df = new_df.drop_duplicates(subset=score_columns)

In [117]:
# Near-matches: combined score strictly between 0.9 and 1. The per-component
# date scores are dropped from the view to keep the table narrow.
is_near_match = (new_df['score'] > 0.9) & (new_df['score'] < 1)
new_df[is_near_match].drop(columns=['year_score', 'month_score', 'day_score'])

Unnamed: 0,name_1,birth_date_1,name_2,birth_date_2,name_score,avg_score,bd_score,score
2,ედიბერიძე მარიამ,1994-12-19,ედიბერიძე მარიკა,1994-12-19,0.966667,0.991667,1.0,0.983333
10,ბალახაშვილი თინათინი,1993-06-24,ბალახაშვილი თინიკო,1993-06-24,0.902778,0.975694,1.0,0.951389
18,წიკლაური ბექა,1992-11-09,წიკლაური ბაქა,1992-11-09,0.875000,0.968750,1.0,0.937500
35,ქურცაძე ნათია,1996-06-30,ქურცაძე ნათელა,1996-06-30,0.937778,0.984444,1.0,0.968889
44,მანასიან ნადეჟდა,1992-10-02,მანუკიან ნადეჟდა,1992-10-02,0.922222,0.980556,1.0,0.961111
...,...,...,...,...,...,...,...,...
2859372,მილაძე-ჰაჯიაყაჯანი თამარ,1990-01-03,მილაძე-ჰაჯიაღაჯანი თამარ,1990-01-03,0.985690,0.996423,1.0,0.992845
2881200,ოგანეზოვი აშოტ,1949-10-19,ოგანესოვ აშოტი,1949-10-19,0.957176,0.989294,1.0,0.978588
2910533,სოსიაშვილი-ჩიფჩიოგლუ ჯულეტა,1979-06-08,სოსიაშვილი-ჩიფტჩიოგლუ ჯულეტა,1979-06-08,0.995726,0.998932,1.0,0.997863
2927083,პარწიკანაშვილი აჩი,2019-08-19,რეუცკი აჩი,2019-08-19,0.804464,0.951116,1.0,0.902232


In [90]:
# Pairs whose name similarity falls in [0.3, 0.4), shown with a reduced column set.
low_name_match = (new_df['name_score'] >= 0.3) & (new_df['name_score'] < 0.4)
report_columns = ['name_1', 'name_2', 'birth_date_1', 'birth_date_2', 'name_score', 'avg_score', 'score']
new_df.loc[low_name_match, report_columns]

Unnamed: 0,name_1,name_2,birth_date_1,birth_date_2,name_score,avg_score,score
4887,HOVHANNI H AN,HOVHANNISYAN MIHRAN,1978-04-02,1978-04-02,0.391111,0.847778,0.695556
6997,UAZ UAZ,HAMBARDZUMYAN GAGIK,1961-01-25,1961-01-25,0.340741,0.835185,0.67037
12749,WANG YINGCHUAN,ZHANG XU,1982-02-28,1977-05-24,0.391667,0.597917,0.529167
16738,?U?IY?V RUV?HAN,NURIYEV ROVSHAN,1963-07-01,1963-07-01,0.4,0.85,0.7
16739,?U??T?V ?VV??A?,NURIYEV ROVSHAN,1963-07-01,1963-07-01,0.360544,0.840136,0.680272
18979,??V???Y?? MANVEL,GEVORGYAN KANVEL,1969-07-29,1969-07-29,0.355556,0.838889,0.677778
26708,SHA4 4ZY4N,SHAKHBAZYAN GARIK,1983-11-12,1983-11-12,0.332121,0.83303,0.666061
26915,LANGENBERG ELIZABETH,TISSOT VAN PATOT WILHELMINA,1953-06-01,1953-06-01,0.391358,0.84784,0.695679
27388,ASA OV IL,HASANOV HABIL,1972-10-21,1972-10-21,0.32381,0.830952,0.661905
48051,CORUM MERKEZ,YENIGEZER HAKAN,1976-05-02,1976-05-02,0.305556,0.826389,0.652778
