# Speed Dating Data: Exploratory Data Analysis


In [None]:
#First let's import the necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import os
from IPython.display import display, HTML

# Let pandas show up to 500 columns when displaying a DataFrame
pd.set_option('display.max_columns', 500)

In [None]:
# Specify the data path: the CSV is looked up in the current working
# directory, i.e. wherever the notebook process was started from.

cwd = os.getcwd()
file_path = os.path.join(cwd, 'cleaned_speed_dating.csv')

Here, we explore some basic dataset details.

In [None]:
# Read the CSV into a pandas DataFrame; `df` (short for "dataframe") is the
# name used for the dataset in the cells that follow.
df=pd.read_csv(file_path)

In [None]:
# Peek at the first rows to check that the file was parsed as expected
df.head()

In [None]:
# Per-column dtype and non-null count, plus overall memory usage
df.info()

In [None]:
# Summary statistics for each column
display(df.describe())

### Correlation

We calculate the correlation matrix of the dataframe.

In [None]:
# Pairwise correlation between the columns of df.
# The numeric/boolean columns are selected explicitly: older pandas dropped
# non-numeric columns from corr() silently, while pandas >= 2.0 raises if any
# are present. For an all-numeric frame the result is unchanged.
corr = df.select_dtypes(include=['number', 'bool']).corr()
display(corr)

In [None]:
# Rank every distinct pair of columns by correlation, highest first.
# corr is symmetric and its diagonal holds each column's correlation with
# itself, so only the strictly lower triangle carries distinct pairs.
shape = corr.shape

# Mask the upper triangle, diagonal included, with a sentinel (-99) that no
# real coefficient can take. The mask is applied to an explicit NumPy copy:
# assigning through `DataFrame.values` is not guaranteed to write back into
# the frame, and the array is read-only under pandas copy-on-write.
bad_indices = np.triu_indices(shape[0])
masked = np.array(corr.values, dtype=float)
masked[bad_indices] = -99
corr_tmp = pd.DataFrame(masked, index=corr.index, columns=corr.columns)

# Flat positions sorted by descending coefficient. Keep only real
# coefficients: the comparison removes the sentinel and is also False for
# NaN, so undefined correlations are discarded instead of shifting the cut.
order = np.argsort(-masked, axis=None)
order = order[masked.ravel()[order] > -99]

# One (row position, column position) pair per row, best correlation first
arg_corr = np.column_stack(np.unravel_index(order, shape))

In [None]:
# Number of pairs listed at each end of the ranking
n_show = 10

# arg_corr holds (row, column) positions into the correlation matrix, ordered
# from the highest coefficient to the lowest. Translate positions into column
# labels via a plain NumPy array, because recent pandas versions do not
# support indexing an Index with a 2-D array.
labels = np.asarray(corr_tmp.columns)
ranked_pairs = labels[arg_corr]

print('Highest Correlation Coefficient\n')

for pair in ranked_pairs[:n_show]:
    print(pair, ' Correlation = ', corr[pair[0]][pair[1]])

print()
print('Lowest Correlation Coefficient\n')

# Walk the ranking from its tail so the lowest coefficient comes first
for pair in ranked_pairs[::-1][:n_show]:
    print(pair, ' Correlation = ', corr[pair[0]][pair[1]])

Look at the correlations above. Are there any results that you weren't expecting?

### Data Visualization

In [None]:
from mpl_toolkits.mplot3d import Axes3D

def plotTwoFeatures(f1, f2):
    """Scatter one column of the global ``df`` against another.

    Opens a new figure and draws ``df[f1]`` (x axis) against ``df[f2]``
    (y axis) as unconnected circle markers, labelling each axis with the
    corresponding column name.
    """
    _, axes = plt.subplots()
    axes.plot(df[f1], df[f2], 'o')
    axes.set_xlabel(f1)
    axes.set_ylabel(f2)



In [None]:
# Scatter a few feature pairs against each other. The first two plots are
# hard to read: per the original note, these columns only hold integers.
feature_pairs = [
    ('art', 'museums'),
    ('intel_partner', 'sinc_partner'),
    ('amb_want', 'attr_want'),
]

for x_feature, y_feature in feature_pairs:
    plotTwoFeatures(x_feature, y_feature)

In [None]:
# Examine the proportion of acceptance by gender.
# Rows with gender == 0 form the female subset and gender == 1 the male one
# (as implied by the variable names used throughout this notebook).
female_df = df.loc[df['gender'] == 0]
male_df = df.loc[df['gender'] == 1]

# Split each subset by decision (dec == 1 is a "yes", dec == 0 a "no").
# The yes_/no_ frames are also what plot_feature() draws its histograms from.
yes_female_df = female_df.query('dec == 1')
no_female_df = female_df.query('dec == 0')
print('Proportion of acceptance by females is {}'.format(yes_female_df.shape[0]/female_df.shape[0]))

yes_male_df = male_df.query('dec == 1')
no_male_df = male_df.query('dec == 0')
print('Proportion of acceptance by males is {}'.format(yes_male_df.shape[0]/male_df.shape[0]))

In [None]:
def plot_feature(feature):
    """Compare a feature's distribution for 'No' and 'Yes' decisions by gender.

    Draws two stacked panels in one figure: the top one from the global
    ``no_female_df`` / ``yes_female_df`` frames and the bottom one from
    ``no_male_df`` / ``yes_male_df``. Each panel shows side-by-side histograms
    (11 bins) of the 'No' rows and the 'Yes' rows for the given column.

    Args:
        feature: Name of the column to plot; must exist in all four frames.
    """
    fig = plt.figure(figsize=(8, 8))
    # Extra vertical spacing so the lower title does not collide with the
    # upper panel's x-axis label.
    fig.subplots_adjust(hspace=0.5)

    panels = (
        (211, 'Female', no_female_df, yes_female_df),
        (212, 'Male', no_male_df, yes_male_df),
    )
    for position, title, no_df, yes_df in panels:
        ax = fig.add_subplot(position)
        ax.set_title(title)
        ax.set_xlabel(feature)
        # hist() expects a flat sequence of 1-D datasets. Wrapping each array
        # in its own list adds a third nesting level, which current
        # matplotlib rejects.
        ax.hist(
            [no_df[feature].values, yes_df[feature].values],
            label=['No', 'Yes'],
            bins=11,
        )
        ax.legend()

    plt.show()

In [None]:
# What do the distributions of certain features look like for 'yes' and 'no' decisions?

plot_feature('attr_partner')

In [None]:
# Same by-gender Yes/No histograms, this time for the 'intel_partner' column
plot_feature('intel_partner')

In [None]:
# Same by-gender Yes/No histograms, this time for the 'shar_partner' column
plot_feature('shar_partner')

Any more cool plots you can think of? Fiddle around with the dataset.