# Speed Dating Dataset

This data was gathered from participants in experimental speed dating events from 2002-2004. During the events, the attendees would have a four-minute "first date" with every other participant of the opposite sex. At the end of their four minutes, participants were asked if they would like to see their date again. They were also asked to rate their date on six attributes: Attractiveness, Sincerity, Intelligence, Fun, Ambition, and Shared Interests. The dataset also includes questionnaire data gathered from participants at different points in the process. These fields include: demographics, dating habits, self-perception across key attributes, beliefs on what others find valuable in a mate, and lifestyle information.

There are 122 columns(independent variables) in the dataset, match column(dependent variable) needs to be predicted.

In [1]:
import pandas as pd
dating = pd.read_csv('speeddating.csv')
dating.head()

Unnamed: 0,has_null,wave,gender,age,age_o,d_age,d_d_age,race,race_o,samerace,...,d_expected_num_interested_in_me,d_expected_num_matches,like,guess_prob_liked,d_like,d_guess_prob_liked,met,decision,decision_o,match
0,b'',1.0,b'female',21.0,27.0,6.0,b'[4-6]',b'Asian/Pacific Islander/Asian-American',b'European/Caucasian-American',b'0',...,b'[0-3]',b'[3-5]',7.0,6.0,b'[6-8]',b'[5-6]',0.0,b'1',b'0',b'0'
1,b'',1.0,b'female',21.0,22.0,1.0,b'[0-1]',b'Asian/Pacific Islander/Asian-American',b'European/Caucasian-American',b'0',...,b'[0-3]',b'[3-5]',7.0,5.0,b'[6-8]',b'[5-6]',1.0,b'1',b'0',b'0'
2,b'',1.0,b'female',21.0,22.0,1.0,b'[0-1]',b'Asian/Pacific Islander/Asian-American',b'Asian/Pacific Islander/Asian-American',b'1',...,b'[0-3]',b'[3-5]',7.0,,b'[6-8]',b'[0-4]',1.0,b'1',b'1',b'1'
3,b'',1.0,b'female',21.0,23.0,2.0,b'[2-3]',b'Asian/Pacific Islander/Asian-American',b'European/Caucasian-American',b'0',...,b'[0-3]',b'[3-5]',7.0,6.0,b'[6-8]',b'[5-6]',0.0,b'1',b'1',b'1'
4,b'',1.0,b'female',21.0,24.0,3.0,b'[2-3]',b'Asian/Pacific Islander/Asian-American',b'Latino/Hispanic American',b'0',...,b'[0-3]',b'[3-5]',6.0,6.0,b'[6-8]',b'[5-6]',0.0,b'1',b'1',b'1'


In [2]:
dating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 123 entries, has_null to match
dtypes: float64(59), object(64)
memory usage: 7.9+ MB


In [3]:
dating.shape

(8378, 123)

In [4]:
#data types in the features
dating.dtypes.value_counts()

object     64
float64    59
dtype: int64

In [5]:
#looking for columns with unique values
dating.nunique().sort_values()

has_null                        1
decision                        2
decision_o                      2
samerace                        2
match                           2
                             ... 
shared_interests_important     85
attractive_important           94
pref_o_attractive              94
interests_correlate           155
field                         260
Length: 123, dtype: int64

In [6]:
#drop the column has_null because it has only one value for all the rows
dating.drop(['has_null'], axis = 1, inplace= True)

In [7]:
#create a function that removes all the unwanted characters b', ''
def remove_characters(feature):
    return feature.replace("b'",'').replace("'","")

#select string columns
string_dataset = dating.select_dtypes(include = ['object'])

#remove the characters
for feature in string_dataset.columns:
    dating[feature] = dating[feature].apply(lambda x: remove_characters(x))

In [8]:
#the columns with the prefix d_ are the values of other columns but binned
to_drop = [column_name for column_name in dating.columns if column_name.startswith('d_')]
dating.drop(to_drop, axis = 1, inplace = True)

In [9]:
dating.shape

(8378, 66)

In [10]:
#decision and decision_o at night event are basically the same as match, match it is calculated from them
dating.drop(['decision', 'decision_o'], axis = 1, inplace= True)

In [11]:
missing_columns = dating.isnull().sum().sort_values()

In [12]:
#drop the columns that have more than the 5% of missing values
dating.drop(['expected_num_interested_in_me', 'expected_num_matches', 'shared_interests_o',
             'shared_interests_partner', 'ambitous_o', 'ambition_partner'], axis = 1, inplace= True)

In [13]:
#There's to columns of age, from self and o, we can get just one age column from diff = self - o
dating['age_diff'] = dating['age'] - dating['age_o']
dating.drop(['age','age_o'], axis = 1, inplace = True)

In [14]:
#if they had met the partner before, yes or no (1 or 0)
dating['met'].value_counts()

0.0    7644
1.0     351
7.0       3
5.0       2
3.0       1
8.0       1
6.0       1
Name: met, dtype: int64

In [15]:
#I change the few different values for the mode that is 0
for number in [3.0, 5.0, 6.0, 7.0, 8.0]:
    dating['met'].replace(number,0, inplace =True)

In [16]:
dating['met'].value_counts()

0.0    7652
1.0     351
Name: met, dtype: int64

In [17]:
#field has many different categorical values, when I convert this column into a numeric one, it sums more than 200 columns
dating.drop(['field'], axis = 1, inplace = True)

In [18]:
dating.shape

(8378, 56)

In [19]:
missing_rows=dating.isnull().sum(axis = 1)
missing_rows.value_counts()

0     7079
1      627
2      143
11     119
3       85
4       61
7       58
8       54
33      48
5       37
32      15
6        8
34       6
19       5
9        5
13       5
12       5
44       5
10       4
15       3
37       2
18       1
43       1
39       1
40       1
dtype: int64

In [20]:
dating_clean = dating.dropna()  #drop 15% of the rows

In [21]:
dating_clean.shape

(7079, 56)

In [22]:
dating_clean.isnull().sum().sum()

0

In [23]:
#numeric columns
columns_numeric = dating_clean.select_dtypes(include = ['int','float']).columns.tolist()

#categorical columns
columns_category = dating_clean.select_dtypes(include = ['object']).drop('match', axis=1).columns

In [24]:
dating_clean[columns_category]

Unnamed: 0,gender,race,race_o,samerace
0,female,Asian/Pacific Islander/Asian-American,European/Caucasian-American,0
1,female,Asian/Pacific Islander/Asian-American,European/Caucasian-American,0
3,female,Asian/Pacific Islander/Asian-American,European/Caucasian-American,0
4,female,Asian/Pacific Islander/Asian-American,Latino/Hispanic American,0
5,female,Asian/Pacific Islander/Asian-American,European/Caucasian-American,0
...,...,...,...,...
8372,male,European/Caucasian-American,European/Caucasian-American,1
8373,male,European/Caucasian-American,Latino/Hispanic American,0
8374,male,European/Caucasian-American,Other,0
8376,male,European/Caucasian-American,Asian/Pacific Islander/Asian-American,0


In [25]:
dating_clean[columns_numeric]

Unnamed: 0,wave,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,attractive_o,...,concerts,music,shopping,yoga,interests_correlate,expected_happy_with_sd_people,like,guess_prob_liked,met,age_diff
0,1.0,2.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,6.0,...,10.0,9.0,8.0,1.0,0.14,3.0,7.0,6.0,0.0,-6.0
1,1.0,2.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,7.0,...,10.0,9.0,8.0,1.0,0.54,3.0,7.0,5.0,1.0,-1.0
3,1.0,2.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,7.0,...,10.0,9.0,8.0,1.0,0.61,3.0,7.0,6.0,0.0,-2.0
4,1.0,2.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,8.0,...,10.0,9.0,8.0,1.0,0.21,3.0,6.0,6.0,0.0,-3.0
5,1.0,2.0,4.0,50.0,0.0,30.0,10.0,0.0,10.0,7.0,...,10.0,9.0,8.0,1.0,0.25,3.0,6.0,5.0,0.0,-4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8372,21.0,1.0,1.0,10.0,15.0,30.0,20.0,15.0,10.0,8.0,...,10.0,10.0,7.0,3.0,0.28,10.0,4.0,4.0,0.0,1.0
8373,21.0,1.0,1.0,10.0,10.0,30.0,20.0,10.0,15.0,10.0,...,10.0,10.0,7.0,3.0,0.64,10.0,2.0,5.0,0.0,-1.0
8374,21.0,1.0,1.0,50.0,20.0,10.0,5.0,10.0,5.0,6.0,...,10.0,10.0,7.0,3.0,0.71,10.0,4.0,4.0,0.0,1.0
8376,21.0,1.0,1.0,10.0,25.0,25.0,10.0,10.0,20.0,5.0,...,10.0,10.0,7.0,3.0,0.62,10.0,5.0,5.0,0.0,3.0


In [26]:
#use get dummies to convert categorical attributes into numericals
dating_ready = pd.get_dummies(data=dating_clean, columns=['gender', 'race', 'race_o', 'samerace', 'match'],drop_first=True)

In [27]:
dating_ready.shape

(7079, 62)

In [28]:
dating_ready.isnull().sum().sum()

0

In [29]:
dating_ready.dtypes.value_counts()

float64    51
uint8      11
dtype: int64

In [30]:
dating_ready.describe()

Unnamed: 0,wave,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,attractive_o,...,race_Black/African American,race_European/Caucasian-American,race_Latino/Hispanic American,race_Other,race_o_Black/African American,race_o_European/Caucasian-American,race_o_Latino/Hispanic American,race_o_Other,samerace_1,match_1
count,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0,...,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0,7079.0
mean,11.299336,3.782738,3.65772,22.232585,17.444366,20.304576,17.490668,10.723546,11.84888,6.209549,...,0.047606,0.563921,0.077977,0.067382,0.048736,0.560107,0.080096,0.06597,0.40048,0.174318
std,5.957994,2.832566,2.81831,12.372573,6.932509,6.831764,6.092708,6.107862,6.348855,1.939503,...,0.212945,0.495932,0.268155,0.250701,0.21533,0.496409,0.271461,0.248247,0.49003,0.37941
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.0,1.0,1.0,15.0,15.0,17.5,15.0,5.0,9.52,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11.0,3.0,3.0,20.0,18.37,20.0,18.0,10.0,10.64,6.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,15.0,6.0,6.0,25.0,20.0,23.81,20.0,15.0,16.0,8.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,21.0,10.0,10.0,100.0,47.0,50.0,50.0,53.0,30.0,10.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
dating_ready.columns

Index(['wave', 'importance_same_race', 'importance_same_religion',
       'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
       'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests',
       'attractive_o', 'sinsere_o', 'intelligence_o', 'funny_o',
       'attractive_important', 'sincere_important', 'intellicence_important',
       'funny_important', 'ambtition_important', 'shared_interests_important',
       'attractive', 'sincere', 'intelligence', 'funny', 'ambition',
       'attractive_partner', 'sincere_partner', 'intelligence_partner',
       'funny_partner', 'sports', 'tvsports', 'exercise', 'dining', 'museums',
       'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater',
       'movies', 'concerts', 'music', 'shopping', 'yoga',
       'interests_correlate', 'expected_happy_with_sd_people', 'like',
       'guess_prob_liked', 'met', 'age_diff', 'gender_male',
       'race_Black/African American', 'race_European/Caucasian-American',
       'race_L

# Logistic Regression and Cross Validation

In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [33]:
#choose attributes to make the regression and the target
y = dating_ready[['match_1']]
X = dating_ready.drop(['match_1'], axis = 1) #all the attributes
X1 = X[['like', 'met']] 
X2 = X[['shopping', 'concerts', 'clubbing']]
X3 = X[['sports', 'tvsports', 'hiking', 'exercise']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.3, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.3, random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.3, random_state=42)


In [34]:
#all the attributes, cv = 5 means we split the data en 5 folds
log_reg = LogisticRegression(random_state=0)

training_X = log_reg.fit(X_train, y_train)
test_X = log_reg.score(X_test, y_test)

result_X = cross_val_score(log_reg, X_train, y_train, cv = 5)

print("Test X:" , test_X)
print("Cross validation X: ",result_X)
print("Mean X: ",result_X.mean())

Test X: 0.8483992467043314
Cross validation X:  [0.82744702 0.83047427 0.85570131 0.85671039 0.85368315]
Mean X:  0.844803229061554


In [35]:
#all the attributes, cv = 5 means we split the data en 5 folds
log_reg = LogisticRegression(random_state=0)

training_X1 = log_reg.fit(X1_train, y1_train)
test_X1 = log_reg.score(X1_test, y1_test)

result_X1 = cross_val_score(log_reg, X1_train, y1_train, cv = 5)

print("Test X1:" , test_X1)
print("Cross validation X1: ",result_X1)
print("Mean X1: ",result_X1.mean())

Test X1: 0.8225047080979284
Cross validation X1:  [0.82240161 0.82946519 0.82744702 0.83047427 0.81634712]
Mean X1:  0.8252270433905146


In [36]:
#all the attributes, cv = 5 means we split the data en 5 folds
log_reg = LogisticRegression(random_state=0)

training_X2 = log_reg.fit(X2_train, y2_train)
test_X2 = log_reg.score(X2_test, y2_test)

result_X2 = cross_val_score(log_reg, X2_train, y2_train, cv = 5)

print("Test X2:" , test_X2)
print("Cross validation X2: ",result_X2)
print("Mean X2: ",result_X2.mean())

Test X2: 0.8220338983050848
Cross validation X2:  [0.82744702 0.82744702 0.82744702 0.82744702 0.82643794]
Mean X2:  0.827245206861756


In [37]:
#all the attributes, cv = 5 means we split the data en 5 folds
log_reg = LogisticRegression(random_state=0)

training_X3 = log_reg.fit(X3_train, y3_train)
test_X3 = log_reg.score(X3_test, y3_test)

result_X3 = cross_val_score(log_reg, X3_train, y3_train, cv = 5)

print("Test X3:" , test_X3)
print("Cross validation X3: ",result_X3)
print("Mean X3: ",result_X3.mean())

Test X3: 0.8220338983050848
Cross validation X3:  [0.82744702 0.82744702 0.82744702 0.82744702 0.82643794]
Mean X3:  0.827245206861756


# Decision tree

In [38]:
from sklearn.tree import DecisionTreeClassifier

#all the attributes, cv = 5 means we split the data en 5 folds
tree = DecisionTreeClassifier(random_state=0)

training_X = tree.fit(X_train, y_train)
test_X = tree.score(X_test, y_test)

result_X = cross_val_score(tree, X_train, y_train, cv = 5)

print("Test X:" , test_X)
print("Cross validation X: ",result_X)
print("Mean X: ",result_X.mean())

Test X: 0.7777777777777778
Cross validation X:  [0.76387487 0.76185671 0.790111   0.78203835 0.79414733]
Mean X:  0.7784056508577194


In [39]:
#all the attributes, cv = 5 means we split the data en 5 folds
tree = DecisionTreeClassifier(random_state=0)

training_X1 = tree.fit(X1_train, y1_train)
test_X1 = tree.score(X1_test, y1_test)

result_X1 = cross_val_score(tree, X1_train, y1_train, cv = 5)

print("Test X1:" , test_X1)
print("Cross validation X1: ",result_X1)
print("Mean X1: ",result_X1.mean())

Test X1: 0.8253295668549906
Cross validation X1:  [0.82542886 0.82946519 0.83047427 0.82542886 0.82542886]
Mean X1:  0.827245206861756


In [40]:
#all the attributes, cv = 5 means we split the data en 5 folds
tree = DecisionTreeClassifier(random_state=0)

training_X2 = tree.fit(X2_train, y2_train)
test_X2 = tree.score(X2_test, y2_test)

result_X2 = cross_val_score(tree, X2_train, y2_train, cv = 5)

print("Test X2:" , test_X2)
print("Cross validation X2: ",result_X2)
print("Mean X2: ",result_X2.mean())

Test X2: 0.821563088512241
Cross validation X2:  [0.82542886 0.81634712 0.8284561  0.82139253 0.82240161]
Mean X2:  0.8228052472250253


In [41]:
#all the attributes, cv = 5 means we split the data en 5 folds
tree = DecisionTreeClassifier(random_state=0)

training_X3 = tree.fit(X3_train, y3_train)
test_X3 = tree.score(X3_test, y3_test)

result_X3 = cross_val_score(tree, X3_train, y3_train, cv = 5)

print("Test X3:" , test_X3)
print("Cross validation X3: ",result_X3)
print("Mean X3: ",result_X3.mean())

Test X3: 0.821563088512241
Cross validation X3:  [0.82643794 0.80726539 0.82542886 0.81432896 0.82139253]
Mean X3:  0.8189707366296671


Look for the hyperparameters of the decision tree: criterion, min_samples_leaf, max_depth and random_state.

In [42]:
from sklearn.model_selection import GridSearchCV

parameters = {'criterion': ['gini', 'entropy'], 'min_samples_leaf': [5, 10, 50, 100, 150, 200],
              'max_depth': [2, 4, 6, 8, 10, 12], 'random_state': [0, 10, 42]}

tree = DecisionTreeClassifier()

searching_X = GridSearchCV(tree, parameters, cv=5)
searching_X.fit(X_train, y_train)

searching_X1 = GridSearchCV(tree, parameters, cv=5)
searching_X1.fit(X1_train, y1_train)

searching_X2 = GridSearchCV(tree, parameters, cv=5)
searching_X2.fit(X2_train, y2_train)

searching_X3 = GridSearchCV(tree, parameters, cv=5)
searching_X3.fit(X3_train, y3_train)


print("Best parameters for X: ", searching_X.best_params_)
print("Mean for X: ", searching_X.best_score_)

print("Best parameters for X1: ", searching_X1.best_params_)
print("Mean for X1: ", searching_X1.best_score_)

print("Best parameters for X2: ", searching_X2.best_params_)
print("Mean for X2: ", searching_X2.best_score_)

print("Best parameters for X3: ", searching_X3.best_params_)
print("Mean for X3: ", searching_X3.best_score_)

Best parameters for X:  {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 50, 'random_state': 0}
Mean for X:  0.8437941473259334
Best parameters for X1:  {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 5, 'random_state': 0}
Mean for X1:  0.8286579212916246
Best parameters for X2:  {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 5, 'random_state': 0}
Mean for X2:  0.8286579212916246
Best parameters for X3:  {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'random_state': 0}
Mean for X3:  0.827245206861756


# Random Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier

#all the attributes, cv = 5 means we split the data en 5 folds
forest = RandomForestClassifier(random_state=0)

training_X = forest.fit(X_train, y_train)
test_X = forest.score(X_test, y_test)

result_X = cross_val_score(forest, X_train, y_train, cv = 5)

print("Test X:" , test_X)
print("Cross validation X: ",result_X)
print("Mean X: ",result_X.mean())

Test X: 0.853578154425612
Cross validation X:  [0.84460141 0.85267407 0.8506559  0.84964682 0.85469223]
Mean X:  0.8504540867810292


In [44]:
forest = RandomForestClassifier(random_state=0)

training_X1 = forest.fit(X1_train, y1_train)
test_X1 = forest.score(X1_test, y1_test)

result_X1 = cross_val_score(forest, X1_train, y1_train, cv = 5)

print("Test X1:" , test_X1)
print("Cross validation X1: ",result_X1)
print("Mean X1: ",result_X1.mean())

Test X1: 0.8253295668549906
Cross validation X1:  [0.8284561  0.82946519 0.83047427 0.82744702 0.82542886]
Mean X1:  0.8282542885973763


In [45]:
forest = RandomForestClassifier(random_state=0)

training_X2 = forest.fit(X2_train, y2_train)
test_X2 = forest.score(X2_test, y2_test)

result_X2 = cross_val_score(forest, X2_train, y2_train, cv = 5)

print("Test X2:" , test_X2)
print("Cross validation X2: ",result_X2)
print("Mean X2: ",result_X2.mean())

Test X2: 0.82015065913371
Cross validation X2:  [0.82643794 0.80928355 0.82744702 0.82038345 0.81735621]
Mean X2:  0.8201816347124117


In [46]:
forest = RandomForestClassifier(random_state=0)

training_X3 = forest.fit(X3_train, y3_train)
test_X3 = forest.score(X3_test, y3_test)

result_X3 = cross_val_score(forest, X3_train, y3_train, cv = 5)

print("Test X3:" , test_X3)
print("Cross validation X3: ",result_X3)
print("Mean X3: ",result_X3.mean())

Test X3: 0.8182674199623352
Cross validation X3:  [0.82542886 0.79818365 0.8234107  0.81029263 0.81533804]
Mean X3:  0.8145307769929364


Look for some of the hyperparameters of the random forest: n_estimators, min_samples_leaf, max_features.

In [47]:
parameters = {'min_samples_leaf': [5, 10, 50, 100, 150, 200], 
              'n_estimators': [50, 100, 150, 200], 'max_features': ['sqrt', 'log2']}

forest = RandomForestClassifier(random_state=0)

searching_X = GridSearchCV(forest, parameters, cv=5)
searching_X.fit(X_train, y_train)
searching_X1= GridSearchCV(forest, parameters, cv=5)
searching_X1.fit(X1_train, y1_train)
searching_X2 = GridSearchCV(forest, parameters, cv=5)
searching_X2.fit(X2_train, y_train)
searching_X3 = GridSearchCV(forest, parameters, cv=5)
searching_X3.fit(X3_train, y_train)

print("Best parameters for X: ", searching_X.best_params_)
print("Mean for X: ", searching_X.best_score_)

print("Best parameters for X1: ", searching_X1.best_params_)
print("Mean for X1: ", searching_X1.best_score_)

print("Best parameters for X2: ", searching_X2.best_params_)
print("Mean for X2: ", searching_X2.best_score_)

print("Best parameters for X3: ", searching_X3.best_params_)
print("Mean for X3: ", searching_X3.best_score_)

Best parameters for X:  {'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 100}
Mean for X:  0.848234106962664
Best parameters for X1:  {'max_features': 'sqrt', 'min_samples_leaf': 10, 'n_estimators': 50}
Mean for X1:  0.8286579212916246
Best parameters for X2:  {'max_features': 'sqrt', 'min_samples_leaf': 10, 'n_estimators': 50}
Mean for X2:  0.827245206861756
Best parameters for X3:  {'max_features': 'sqrt', 'min_samples_leaf': 50, 'n_estimators': 50}
Mean for X3:  0.827245206861756


# SVM

In [48]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

polynomial = SVC(kernel='poly', random_state = 0)

training_X = polynomial.fit(X_train,y_train)
test_X = polynomial.score(X_test, y_test)

result_X = cross_val_score(polynomial, X_train, y_train, cv = 5)

print("Test X:" , test_X)
print("Cross validation X: ",result_X)
print("Mean X: ",result_X.mean())

Test X: 0.8408662900188324
Cross validation X:  [0.83854692 0.83854692 0.839556   0.84157417 0.84258325]
Mean X:  0.8401614530776994


In [49]:
polynomial = SVC(kernel='poly', random_state = 0)

training_X1 = polynomial.fit(X1_train,y1_train)
test_X1 = polynomial.score(X1_test, y1_test)

result_X1 = cross_val_score(polynomial, X1_train, y1_train, cv = 5)

print("Test X1:" , test_X1)
print("Cross validation X1: ",result_X1)
print("Mean X1: ",result_X1.mean())

Test X1: 0.8220338983050848
Cross validation X1:  [0.82744702 0.82744702 0.82744702 0.82744702 0.82643794]
Mean X1:  0.827245206861756


In [50]:
polynomial = SVC(kernel='poly', random_state = 0)

training_X2 = polynomial.fit(X2_train,y2_train)
test_X2 = polynomial.score(X2_test, y2_test)

result_X2 = cross_val_score(polynomial, X2_train, y2_train, cv = 5)

print("Test X2:" , test_X2)
print("Cross validation X2: ",result_X2)
print("Mean X2: ",result_X2.mean())

Test X2: 0.8220338983050848
Cross validation X2:  [0.82744702 0.82744702 0.82744702 0.82744702 0.82643794]
Mean X2:  0.827245206861756


In [51]:
polynomial = SVC(kernel='poly', random_state = 0)

training_X3 = polynomial.fit(X3_train,y3_train)
test_X3 = polynomial.score(X3_test, y3_test)

result_X3 = cross_val_score(polynomial, X3_train, y3_train, cv = 5)

print("Test X3:" , test_X3)
print("Cross validation X3: ",result_X3)
print("Mean X3: ",result_X3.mean())

Test X3: 0.8220338983050848
Cross validation X3:  [0.82744702 0.82744702 0.82744702 0.82744702 0.82643794]
Mean X3:  0.827245206861756


Look for the hyperparameters of the polynomial kernel: $\gamma$, $r$ and $d$.

In [55]:
parameters = {'degree': [2, 3, 4],
              'gamma': [0.1, 1, 10],
              'coef0': [0, 1, 2]}

polynomial = SVC(kernel='poly', random_state = 0)

searching_X = GridSearchCV(polynomial, parameters, cv=5)
searching_X.fit(X_train, y_train)

print("Best parameters for X: ", searching_X.best_params_)
print("Mean for X: ", searching_X.best_score_)

y_predicted = searching_X.predict(X_test)
accuracy = accuracy_score(y_test, y_predicted)

# All the results

In [54]:
# X = dating_ready.drop(['match_1'], axis = 1) #all the attributes
# X1 = X[['like', 'met']] 
# X2 = X[['shopping', 'concerts', 'clubbing']]
# X3 = X[['sports', 'tvsports', 'hiking', 'exercise']]

from tabulate import tabulate 
#create data
data = [["X", 0.848, 0.844, 0.777, 0.778, 0.844, 0.854, 0.850, 0.848, 0.841, 0.840],
        ["X1", 0.822, 0.825, 0.825, 0.827, 0.829, 0.825, 0.828, 0.829, 0.822, 0.827],
        ["X2", 0.822, 0.827, 0.822, 0.823, 0.829, 0.820, 0.820, 0.827, 0.822, 0.827],
        ["X3", 0.822, 0.827, 0.822, 0.819, 0.827, 0.818, 0.814, 0.827, 0.822, 0.827]
        ]
  
#define header names
col_names = ["Features", "Log Reg", "Log Reg CV", "Dec Tree", "Dec Tree CV", "Dec Tree Hyp",
             "Rand Forest", "Rand Forest CV", "Rand Forest Hyp", "Poly", "Poly CV"]
  
#display table
print(tabulate(data, headers=col_names))

Features      Log Reg    Log Reg CV    Dec Tree    Dec Tree CV    Dec Tree Hyp    Rand Forest    Rand Forest CV    Rand Forest Hyp    Poly    Poly CV
----------  ---------  ------------  ----------  -------------  --------------  -------------  ----------------  -----------------  ------  ---------
X               0.848         0.844       0.777          0.778           0.844          0.854             0.85               0.848   0.841      0.84
X1              0.822         0.825       0.825          0.827           0.829          0.825             0.828              0.829   0.822      0.827
X2              0.822         0.827       0.822          0.823           0.829          0.82              0.82               0.827   0.822      0.827
X3              0.822         0.827       0.822          0.819           0.827          0.818             0.814              0.827   0.822      0.827
