In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

dating_data_clean = pd.read_csv('dating_data_clean.csv')
dating_data_clean.shape

(8378, 74)

** Changing variables to category type **

In [2]:
dating_data_clean['match'] = dating_data_clean['match'].astype('category')
dating_data_clean['dec'] = dating_data_clean['dec'].astype('category')
dating_data_clean['dec_o'] = dating_data_clean['dec_o'].astype('category')

# Machine Learning

## 1) Predicting based on important attributes at sign up


Each participant ranked the attributes (attractive, sincere, intelligent, ambitious, fun, shared interests) assigning a scale from 1 to 10 based on what is important for them in a partner. The values were normalized to a scale from 0 to 1.
Hypothesis is that the distance between participants in each attribute is a good indicator of a match.

**All 6 attributes**

In [328]:
from sklearn.model_selection import cross_val_score

# Using default values in RandomForest:

X = dating_data_clean[['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha']]
y = dating_data_clean['match']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

rf_all = RandomForestClassifier()

# Fit the random search model
rf_all.fit(X_train, y_train)

# Predicting values:
y_pred_all = rf_all.predict(X_test)


In [329]:
print(confusion_matrix(y_test, y_pred_all))
print(classification_report(y_test, y_pred_all))

[[1369   31]
 [ 255   21]]
             precision    recall  f1-score   support

          0       0.84      0.98      0.91      1400
          1       0.40      0.08      0.13       276

avg / total       0.77      0.83      0.78      1676



In [330]:
accuracy = rf_all.score(X_test, y_test)
accuracy

0.8293556085918854

Low recall: high number of false negatives

**3 attributes**

In [261]:
# Using default values in RandomForest:

X = dating_data_clean[['attr1_1', 'fun1_1', 'shar1_1', 'pf_o_att', 'pf_o_fun', 'pf_o_sha']]
y = dating_data_clean['match']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

rf_3 = RandomForestClassifier()

# Fit the random search model
rf_3.fit(X_train, y_train)

# Predicting values:
y_pred_3 = rf_3.predict(X_test)



In [262]:
print(confusion_matrix(y_test, y_pred_3))
print(classification_report(y_test, y_pred_3))

[[1338   62]
 [ 241   35]]
             precision    recall  f1-score   support

          0       0.85      0.96      0.90      1400
          1       0.36      0.13      0.19       276

avg / total       0.77      0.82      0.78      1676



In [264]:
accuracy = rf_3.score(X_test, y_test)
accuracy

0.81921241050119331

Low precision and low recall: high number of false negatives and the match prediction is also low

## Distance between what's important for both participants

In [354]:
dating_data_clean.columns

Index(['iid', 'gender', 'idg', 'partner', 'pid', 'match', 'pf_o_att',
       'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha', 'dec_o',
       'attr_o', 'sinc_o', 'intel_o', 'fun_o', 'amb_o', 'shar_o', 'like_o',
       'prob_o', 'met_o', 'age', 'field_cd', 'goal', 'date', 'go_out',
       'career_c', 'sports', 'tvsports', 'exercise', 'dining', 'museums',
       'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater',
       'movies', 'concerts', 'music', 'shopping', 'yoga', 'exphappy', 'expnum',
       'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1',
       'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1', 'dec', 'attr',
       'sinc', 'intel', 'fun', 'amb', 'shar', 'like', 'prob', 'met',
       'match_es', 'attr3_s', 'sinc3_s', 'intel3_s', 'fun3_s', 'amb3_s',
       'dis_att', 'dis_sinc', 'dis_intel', 'dis_fun', 'dis_amb', 'dis_sha'],
      dtype='object')

In [355]:
dis_att = abs(dating_data_clean['attr1_1'] - dating_data_clean['pf_o_att'])
dis_sinc = abs(dating_data_clean['sinc1_1'] - dating_data_clean['pf_o_sin'])
dis_intel = abs(dating_data_clean['intel1_1'] - dating_data_clean['pf_o_int'])
dis_fun = abs(dating_data_clean['fun1_1'] - dating_data_clean['pf_o_fun'])
dis_amb = abs(dating_data_clean['amb1_1'] - dating_data_clean['pf_o_amb'])
dis_sha = abs(dating_data_clean['shar1_1'] - dating_data_clean['pf_o_sha'])

In [356]:
dating_data_clean['dis_att'] = dis_att
dating_data_clean['dis_sinc'] = dis_sinc
dating_data_clean['dis_intel'] = dis_intel
dating_data_clean['dis_fun'] = dis_fun
dating_data_clean['dis_amb'] = dis_amb
dating_data_clean['dis_sha'] = dis_sha


**Using default parameters**

In [357]:
X = dating_data_clean[['dis_att', 'dis_sinc', 'dis_intel', 'dis_fun', 'dis_amb', 'dis_sha']]
y = dating_data_clean['match']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

rf_dis = RandomForestClassifier()

# Fit the random search model
rf_dis.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [358]:
y_pred_dis = rf_dis.predict(X_test)

In [359]:
print(confusion_matrix(y_test, y_pred_dis))
print(classification_report(y_test, y_pred_dis))

[[1357   43]
 [ 114  162]]
             precision    recall  f1-score   support

          0       0.92      0.97      0.95      1400
          1       0.79      0.59      0.67       276

avg / total       0.90      0.91      0.90      1676



In [360]:
accuracy = rf_dis.score(X_test, y_test)
accuracy

0.90632458233890212

Good improvement in precision and recall. Very good accuracy.

**Using hyperparameter optimization**

In [410]:
rf_dis = RandomForestClassifier(n_estimators=200)

# Fit the model
rf_dis.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [411]:
y_pred_dis = rf_dis.predict(X_test)

In [412]:
print(confusion_matrix(y_test, y_pred_dis))
print(classification_report(y_test, y_pred_dis))

[[1367   33]
 [  90  186]]
             precision    recall  f1-score   support

          0       0.94      0.98      0.96      1400
          1       0.85      0.67      0.75       276

avg / total       0.92      0.93      0.92      1676



In [413]:
accuracy = rf_dis.score(X_test, y_test)
accuracy

0.9266109785202864

Better precision, recall and accuracy.

Question: matrix is better when not splitting into test and training. Since CV is being used, is it acceptable?

**Using RandomSearchCV for n_estimators**

In [414]:
#### Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 100)]

random_grid = {'n_estimators': n_estimators}

rf_all = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf_all, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)




Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=14 .................................................
[CV] n_estimators=14 .................................................
[CV] n_estimators=14 .................................................
[CV] .................................. n_estimators=10, total=   0.1s
[CV] .................................. n_estimators=10, total=   0.1s
[CV] n_estimators=14 .................................................
[CV] .................................. n_estimators=10, total=   0.1s
[CV] n_estimators=14 .................................................
[CV] n_estimat

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s


[CV] .................................. n_estimators=54, total=   0.6s
[CV] n_estimators=54 .................................................
[CV] .................................. n_estimators=44, total=   0.4s
[CV] n_estimators=59 .................................................
[CV] .................................. n_estimators=44, total=   0.5s
[CV] n_estimators=59 .................................................
[CV] .................................. n_estimators=49, total=   0.5s
[CV] n_estimators=64 .................................................
[CV] .................................. n_estimators=49, total=   0.5s
[CV] n_estimators=64 .................................................
[CV] .................................. n_estimators=54, total=   0.5s
[CV] n_estimators=54 .................................................
[CV] .................................. n_estimators=54, total=   0.5s
[CV] n_estimators=69 .................................................
[CV] .

[CV] ................................. n_estimators=104, total=   1.1s
[CV] n_estimators=113 ................................................
[CV] ................................. n_estimators=108, total=   1.2s
[CV] n_estimators=113 ................................................
[CV] ................................. n_estimators=108, total=   1.2s
[CV] n_estimators=113 ................................................
[CV] ................................. n_estimators=108, total=   1.2s
[CV] n_estimators=118 ................................................
[CV] ................................. n_estimators=108, total=   1.2s
[CV] n_estimators=118 ................................................
[CV] ................................. n_estimators=108, total=   1.2s
[CV] n_estimators=118 ................................................
[CV] ................................. n_estimators=113, total=   1.2s
[CV] n_estimators=118 ................................................
[CV] .

[CV] ................................. n_estimators=163, total=   1.4s
[CV] n_estimators=173 ................................................
[CV] ................................. n_estimators=163, total=   1.4s
[CV] n_estimators=173 ................................................
[CV] ................................. n_estimators=163, total=   1.4s
[CV] n_estimators=173 ................................................
[CV] ................................. n_estimators=168, total=   1.5s
[CV] n_estimators=173 ................................................
[CV] ................................. n_estimators=168, total=   1.5s
[CV] n_estimators=173 ................................................
[CV] ................................. n_estimators=168, total=   1.5s
[CV] n_estimators=178 ................................................
[CV] ................................. n_estimators=168, total=   1.4s
[CV] n_estimators=178 ................................................
[CV] .

[Parallel(n_jobs=-1)]: Done 181 tasks      | elapsed:   25.0s


[CV] ................................. n_estimators=188, total=   1.7s
[CV] n_estimators=193 ................................................
[CV] ................................. n_estimators=188, total=   1.6s
[CV] n_estimators=198 ................................................
[CV] ................................. n_estimators=188, total=   1.6s
[CV] n_estimators=198 ................................................
[CV] ................................. n_estimators=188, total=   1.6s
[CV] n_estimators=198 ................................................
[CV] ................................. n_estimators=193, total=   1.6s
[CV] n_estimators=198 ................................................
[CV] ................................. n_estimators=193, total=   1.7s
[CV] n_estimators=198 ................................................
[CV] ................................. n_estimators=193, total=   1.6s
[CV] n_estimators=203 ................................................
[CV] .

[CV] ................................. n_estimators=242, total=   2.8s
[CV] n_estimators=252 ................................................
[CV] ................................. n_estimators=247, total=   2.7s
[CV] n_estimators=252 ................................................
[CV] ................................. n_estimators=247, total=   2.8s
[CV] n_estimators=252 ................................................
[CV] ................................. n_estimators=247, total=   2.8s
[CV] n_estimators=257 ................................................
[CV] ................................. n_estimators=247, total=   2.7s
[CV] n_estimators=257 ................................................
[CV] ................................. n_estimators=247, total=   2.8s
[CV] n_estimators=257 ................................................
[CV] ................................. n_estimators=252, total=   2.8s
[CV] n_estimators=257 ................................................
[CV] .

[CV] ................................. n_estimators=302, total=   2.7s
[CV] n_estimators=311 ................................................
[CV] ................................. n_estimators=302, total=   2.8s
[CV] n_estimators=311 ................................................
[CV] ................................. n_estimators=302, total=   2.9s
[CV] n_estimators=311 ................................................
[CV] ................................. n_estimators=306, total=   3.1s
[CV] n_estimators=311 ................................................
[CV] ................................. n_estimators=306, total=   3.2s
[CV] n_estimators=311 ................................................
[CV] ................................. n_estimators=306, total=   3.1s
[CV] n_estimators=316 ................................................
[CV] ................................. n_estimators=306, total=   3.2s
[CV] n_estimators=316 ................................................
[CV] .

[CV] ................................. n_estimators=361, total=   3.1s
[CV] n_estimators=366 ................................................
[CV] ................................. n_estimators=361, total=   3.2s
[CV] n_estimators=366 ................................................
[CV] ................................. n_estimators=361, total=   3.1s
[CV] n_estimators=371 ................................................
[CV] ................................. n_estimators=361, total=   3.1s
[CV] n_estimators=371 ................................................
[CV] ................................. n_estimators=361, total=   3.1s
[CV] n_estimators=371 ................................................
[CV] ................................. n_estimators=366, total=   3.1s
[CV] n_estimators=371 ................................................
[CV] ................................. n_estimators=366, total=   3.2s
[CV] n_estimators=371 ................................................
[CV] .

[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:  1.7min


[CV] ................................. n_estimators=386, total=   3.3s
[CV] n_estimators=396 ................................................
[CV] ................................. n_estimators=391, total=   3.4s
[CV] n_estimators=396 ................................................
[CV] ................................. n_estimators=391, total=   3.4s
[CV] n_estimators=396 ................................................
[CV] ................................. n_estimators=391, total=   3.3s
[CV] n_estimators=401 ................................................
[CV] ................................. n_estimators=391, total=   3.3s
[CV] n_estimators=401 ................................................
[CV] ................................. n_estimators=391, total=   3.3s
[CV] n_estimators=401 ................................................
[CV] ................................. n_estimators=396, total=   3.4s
[CV] n_estimators=401 ................................................
[CV] .

[CV] ................................. n_estimators=445, total=   4.0s
[CV] n_estimators=455 ................................................
[CV] ................................. n_estimators=445, total=   3.9s
[CV] n_estimators=455 ................................................
[CV] ................................. n_estimators=445, total=   4.0s
[CV] n_estimators=455 ................................................
[CV] ................................. n_estimators=450, total=   4.0s
[CV] n_estimators=455 ................................................
[CV] ................................. n_estimators=450, total=   4.1s
[CV] n_estimators=455 ................................................
[CV] ................................. n_estimators=450, total=   3.9s
[CV] n_estimators=460 ................................................
[CV] ................................. n_estimators=450, total=   3.9s
[CV] n_estimators=460 ................................................
[CV] .

[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.7min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [10, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, 89, 94, 99, 104, 108, 113, 118, 123, 128, 133, 138, 143, 148, 153, 158, 163, 168, 173, 178, 183, 188, 193, 198, 203, 207, 212, 217, 222, 227, 232, 237, 242, 247, 252, 257, 262, 267, 272, 277, 282, 2...405, 410, 415, 420, 425, 430, 435, 440, 445, 450, 455, 460, 465, 470, 475, 480, 485, 490, 495, 500]},
          pre_dispatch='2*n_jobs', ra

In [415]:
rf_random.best_params_

{'n_estimators': 490}

In [416]:
y_pred_dis = rf_random.predict(X_test)
sum(y_pred_dis)

223

In [417]:
print(confusion_matrix(y_test, y_pred_dis))
print(classification_report(y_test, y_pred_dis))

[[1362   38]
 [  91  185]]
             precision    recall  f1-score   support

          0       0.94      0.97      0.95      1400
          1       0.83      0.67      0.74       276

avg / total       0.92      0.92      0.92      1676



**Suggesting participants who should meet based on prediction**

In [301]:
# Creating a dataset with all participants vs all participants:

dating_import = dating_data_clean[['iid', 'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']].copy()
dating_import = dating_import.drop_duplicates()
dating_import2 = dating_import

In [302]:
dating_import2 = dating_import2.rename(columns={'iid':'iid_b', 'attr1_1':'attr1_1_b', 'sinc1_1':'sinc1_1_b', 'intel1_1':'intel1_1_b', 'fun1_1':'fun1_1_b', 'amb1_1':'amb1_1_b', 'shar1_1':'shar1_1_b'})

In [303]:
dating_import['key'] = 0
dating_import2['key'] = 0

df_cartesian = dating_import.merge(dating_import2, how='outer')
df_cartesian

Unnamed: 0,iid,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,key,iid_b,attr1_1_b,sinc1_1_b,intel1_1_b,fun1_1_b,amb1_1_b,shar1_1_b
0,1,0.15,0.2,0.20,0.15,0.15,0.15,0,1,0.150000,0.200000,0.200000,0.150000,0.150000,0.150000
1,1,0.15,0.2,0.20,0.15,0.15,0.15,0,2,0.450000,0.050000,0.250000,0.200000,0.000000,0.050000
2,1,0.15,0.2,0.20,0.15,0.15,0.15,0,3,0.350000,0.100000,0.350000,0.100000,0.100000,0.000000
3,1,0.15,0.2,0.20,0.15,0.15,0.15,0,4,0.200000,0.200000,0.200000,0.200000,0.100000,0.100000
4,1,0.15,0.2,0.20,0.15,0.15,0.15,0,5,0.200000,0.050000,0.250000,0.250000,0.100000,0.150000
5,1,0.15,0.2,0.20,0.15,0.15,0.15,0,6,0.100000,0.250000,0.200000,0.250000,0.050000,0.150000
6,1,0.15,0.2,0.20,0.15,0.15,0.15,0,7,0.150000,0.150000,0.250000,0.200000,0.150000,0.100000
7,1,0.15,0.2,0.20,0.15,0.15,0.15,0,8,0.090909,0.181818,0.272727,0.181818,0.181818,0.090909
8,1,0.15,0.2,0.20,0.15,0.15,0.15,0,9,0.200000,0.100000,0.200000,0.300000,0.100000,0.100000
9,1,0.15,0.2,0.20,0.15,0.15,0.15,0,10,0.150000,0.150000,0.150000,0.400000,0.100000,0.050000


In [304]:
df_cartesian = df_cartesian.drop((df_cartesian[df_cartesian['iid'] == df_cartesian['iid_b']]).index)

In [305]:
df_cartesian

Unnamed: 0,iid,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,key,iid_b,attr1_1_b,sinc1_1_b,intel1_1_b,fun1_1_b,amb1_1_b,shar1_1_b
1,1,0.15,0.2,0.20,0.15,0.15,0.15,0,2,0.450000,0.050000,0.250000,0.200000,0.000000,0.050000
2,1,0.15,0.2,0.20,0.15,0.15,0.15,0,3,0.350000,0.100000,0.350000,0.100000,0.100000,0.000000
3,1,0.15,0.2,0.20,0.15,0.15,0.15,0,4,0.200000,0.200000,0.200000,0.200000,0.100000,0.100000
4,1,0.15,0.2,0.20,0.15,0.15,0.15,0,5,0.200000,0.050000,0.250000,0.250000,0.100000,0.150000
5,1,0.15,0.2,0.20,0.15,0.15,0.15,0,6,0.100000,0.250000,0.200000,0.250000,0.050000,0.150000
6,1,0.15,0.2,0.20,0.15,0.15,0.15,0,7,0.150000,0.150000,0.250000,0.200000,0.150000,0.100000
7,1,0.15,0.2,0.20,0.15,0.15,0.15,0,8,0.090909,0.181818,0.272727,0.181818,0.181818,0.090909
8,1,0.15,0.2,0.20,0.15,0.15,0.15,0,9,0.200000,0.100000,0.200000,0.300000,0.100000,0.100000
9,1,0.15,0.2,0.20,0.15,0.15,0.15,0,10,0.150000,0.150000,0.150000,0.400000,0.100000,0.050000
10,1,0.15,0.2,0.20,0.15,0.15,0.15,0,11,0.350000,0.200000,0.200000,0.200000,0.000000,0.050000


In [418]:
dis_att = abs(df_cartesian['attr1_1'] - df_cartesian['attr1_1_b'])
dis_sinc = abs(df_cartesian['sinc1_1'] - df_cartesian['sinc1_1_b'])
dis_intel = abs(df_cartesian['intel1_1'] - df_cartesian['intel1_1_b'])
dis_fun = abs(df_cartesian['fun1_1'] - df_cartesian['fun1_1_b'])
dis_amb = abs(df_cartesian['amb1_1'] - df_cartesian['amb1_1_b'])
dis_sha = abs(df_cartesian['shar1_1'] - df_cartesian['shar1_1_b'])

In [419]:
df_cartesian['dis_att'] = dis_att
df_cartesian['dis_sinc'] = dis_sinc
df_cartesian['dis_intel'] = dis_intel
df_cartesian['dis_fun'] = dis_fun
df_cartesian['dis_amb'] = dis_amb
df_cartesian['dis_sha'] = dis_sha


**Prediction using rf_dis**:

In [420]:
X_new = df_cartesian[['dis_att', 'dis_sinc', 'dis_intel', 'dis_fun', 'dis_amb', 'dis_sha']]
y_pred_all = rf_dis.predict(X_new)

In [421]:
sum(y_pred_all)

10474

In [422]:
len(y_pred_all)

303050

In [423]:
sum(y_pred_all) / len(y_pred_all)

0.034561953473024251

**Prediction using rf_random**:

In [424]:
X_new_ran = df_cartesian[['dis_att', 'dis_sinc', 'dis_intel', 'dis_fun', 'dis_amb', 'dis_sha']]
y_pred_all_ran = rf_random.predict(X_new_ran)

In [425]:
sum(y_pred_all_ran)

10518

In [426]:
len(y_pred_all_ran)

303050

In [427]:
sum(y_pred_all_ran) / len(y_pred_all_ran)

0.034707144035637683

------------------------to be updated-----------------------------------------

## Distance only considering shar and attr

In [339]:
X = dating_data_clean[['dis_att', 'dis_fun', 'dis_sha']]

rf_dis = RandomForestClassifier()

# Fit the random search model
rf_dis.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [340]:
y_pred_dis = rf_dis.predict(X)
sum(y_pred_dis)

826

In [341]:
print(confusion_matrix(y, y_pred_dis))
print(classification_report(y, y_pred_dis))

[[6918   80]
 [ 634  746]]
             precision    recall  f1-score   support

          0       0.92      0.99      0.95      6998
          1       0.90      0.54      0.68      1380

avg / total       0.91      0.91      0.91      8378



## Distance of self evaluation

In [214]:
dating_self = dating_data_clean[['iid', 'attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1']].copy()
dating_self = dating_self.drop_duplicates()
dating_self

Unnamed: 0,iid,attr3_1,sinc3_1,intel3_1,fun3_1,amb3_1
0,1,0.162162,0.216216,0.216216,0.216216,0.189189
10,2,0.212121,0.151515,0.242424,0.303030,0.090909
20,3,0.190476,0.214286,0.214286,0.190476,0.190476
30,4,0.179487,0.205128,0.179487,0.230769,0.205128
40,5,0.181818,0.090909,0.303030,0.181818,0.242424
50,6,0.147059,0.205882,0.264706,0.235294,0.147059
60,7,0.193548,0.193548,0.225806,0.161290,0.225806
70,8,0.200000,0.114286,0.228571,0.228571,0.228571
80,9,0.189189,0.162162,0.189189,0.270270,0.189189
90,10,0.153846,0.205128,0.153846,0.256410,0.230769


In [215]:
dating_data_self = dating_data_clean.merge(dating_self, how='inner', left_on = 'pid', right_on = 'iid')
dating_data_self

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


Unnamed: 0,iid_x,gender,idg,partner,pid,match,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,...,dis_intel,dis_fun,dis_amb,dis_sha,iid_y,attr3_1_y,sinc3_1_y,intel3_1_y,fun3_1_y,amb3_1_y
0,1,0,1,1,11,0,0.35,0.20,0.20,0.20,...,0.000000,0.050000,0.150000,0.100000,11,0.216216,0.243243,0.216216,0.189189,0.135135
1,2,0,3,1,11,0,0.35,0.20,0.20,0.20,...,0.050000,0.000000,0.000000,0.000000,11,0.216216,0.243243,0.216216,0.189189,0.135135
2,3,0,5,1,11,0,0.35,0.20,0.20,0.20,...,0.150000,0.100000,0.100000,0.050000,11,0.216216,0.243243,0.216216,0.189189,0.135135
3,4,0,7,1,11,0,0.35,0.20,0.20,0.20,...,0.000000,0.000000,0.100000,0.050000,11,0.216216,0.243243,0.216216,0.189189,0.135135
4,5,0,9,1,11,0,0.35,0.20,0.20,0.20,...,0.050000,0.050000,0.100000,0.100000,11,0.216216,0.243243,0.216216,0.189189,0.135135
5,6,0,11,1,11,0,0.35,0.20,0.20,0.20,...,0.000000,0.050000,0.050000,0.100000,11,0.216216,0.243243,0.216216,0.189189,0.135135
6,7,0,13,1,11,0,0.35,0.20,0.20,0.20,...,0.050000,0.000000,0.150000,0.050000,11,0.216216,0.243243,0.216216,0.189189,0.135135
7,8,0,15,1,11,0,0.35,0.20,0.20,0.20,...,0.072727,0.018182,0.181818,0.040909,11,0.216216,0.243243,0.216216,0.189189,0.135135
8,9,0,17,1,11,0,0.35,0.20,0.20,0.20,...,0.000000,0.100000,0.100000,0.050000,11,0.216216,0.243243,0.216216,0.189189,0.135135
9,10,0,19,1,11,0,0.35,0.20,0.20,0.20,...,0.050000,0.200000,0.100000,0.000000,11,0.216216,0.243243,0.216216,0.189189,0.135135


In [219]:
dis_att = abs(dating_data_self['attr3_1_x'] - dating_data_self['attr3_1_y'])
dis_sinc = abs(dating_data_self['sinc3_1_x'] - dating_data_self['sinc3_1_y'])
dis_intel = abs(dating_data_self['intel3_1_x'] - dating_data_self['intel3_1_y'])
dis_fun = abs(dating_data_self['fun3_1_x'] - dating_data_self['fun3_1_y'])
dis_amb = abs(dating_data_self['amb3_1_x'] - dating_data_self['amb3_1_y'])

In [220]:
dating_data_self['dis_att_s'] = dis_att
dating_data_self['dis_sinc_s'] = dis_sinc
dating_data_self['dis_intel_s'] = dis_intel
dating_data_self['dis_fun_s'] = dis_fun
dating_data_self['dis_amb_s'] = dis_amb


In [222]:

X = dating_data_self[['dis_att_s', 'dis_sinc_s', 'dis_intel_s', 'dis_fun_s', 'dis_amb_s']]
y = dating_data_self['match']


In [223]:
# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

rf_dis = RandomForestClassifier()

# Fit the random search model
rf_dis.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [224]:
y_pred_dis = rf_dis.predict(X_test)
sum(y_pred_dis)

1303

In [225]:
print(confusion_matrix(y_test, y_pred_dis))
print(classification_report(y_test, y_pred_dis))

[[6949   49]
 [ 126 1254]]
             precision    recall  f1-score   support

          0       0.98      0.99      0.99      6998
          1       0.96      0.91      0.93      1380

avg / total       0.98      0.98      0.98      8378



In [229]:
accuracy = rf_dis.score(X, y)
accuracy

0.97911195989496302

In [226]:
X

Unnamed: 0,dis_att_s,dis_sinc_s,dis_intel_s,dis_fun_s,dis_amb_s
0,0.054054,0.027027,0.000000,0.027027,0.054054
1,0.004095,0.091728,0.026208,0.113841,0.044226
2,0.025740,0.028958,0.001931,0.001287,0.055341
3,0.036729,0.038115,0.036729,0.041580,0.069993
4,0.034398,0.152334,0.086814,0.007371,0.107289
5,0.069157,0.037361,0.048490,0.046105,0.011924
6,0.022668,0.049695,0.009590,0.027899,0.090671
7,0.016216,0.128958,0.012355,0.039382,0.093436
8,0.027027,0.081081,0.027027,0.081081,0.054054
9,0.062370,0.038115,0.062370,0.067221,0.095634
