# Nearest Neighbours

#### Here we have a function which allows us to find the nearest neighbours of a person given either their attributes, or their iid number

In [195]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [196]:
df = pd.read_csv('../data/Speed Dating Data.csv')
df.head()

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


In [197]:
# need this to get age in the same format (1-10) as other attributes. 
# Can also go from 1-10 to original age if direction='backward'
def rescale_age(age, direction="forward"):
    ser = df['age'] 
    if direction.lower() == 'forward':
        result = (10 - 1) / (ser.max() - ser.min()) * (age - ser.min()) + 1
    elif direction.lower() == 'backward':     
        result = (age - 1) * (ser.max() - ser.min()) / (10. - 1) + ser.min()
    return result

In [198]:
# Get a subsetted dataframe grouped by iid
cols = ['iid', 'gender']
atts = ['age', 'attr_o', 'sinc_o', 'intel_o', 'fun_o', 'amb_o']
model_df = df[atts+cols]
# rescale age to be between 1 and 10
model_df['age'] = rescale_age(model_df['age'])

model_df = model_df.dropna(how='all')
model_df = model_df.groupby('iid').mean()
model_df.reset_index(inplace=True)
model_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,iid,age,attr_o,sinc_o,intel_o,fun_o,amb_o,gender
0,1,1.72973,6.7,7.4,8.0,7.2,8.0,0
1,2,2.459459,7.7,7.1,7.9,7.5,7.5,0
2,3,2.702703,6.5,7.1,7.3,6.2,7.111111,0
3,4,2.216216,7.0,7.1,7.7,7.5,7.7,0
4,5,1.72973,5.3,7.7,7.6,7.2,7.8,0


In [199]:
# this function takes in a pandas series of attributes OR an iid and returns a new dataframe ordered by the people who are most
# similar to a person with those attributes or iid
def nearest_neighbours(att_series=pd.Series(), iid_val=None):
    if not att_series.empty:
        # subsets model_df to a df with only the same sex as in att_series
        gender_df = model_df[model_df['gender'] == att_series['gender']]
        # rescale age to 1-10
        att_series['age'] = rescale_age(att_series['age'])
        
        sub_att_series = att_series.drop('gender', axis=0)
        difference_2Darray = gender_df[atts].values - sub_att_series.values
        
    elif iid_val:
        iid_gender = model_df.loc[model_df['iid'] == iid_val, 'gender']
        gender_df = model_df[model_df['gender'] == iid_gender]
        
        difference_2Darray = (gender_df[atts].values - model_df[model_df['iid'] == iid_val][atts].values)
    else:
        print "You didn't enter an iid value OR an attribute series"
        
    distance_1Darray = np.sum(np.square(difference_2Darray), axis=1)

    df2=gender_df.copy()
    df2['dist'] = distance_1Darray
    df2['age'] = rescale_age(df2['age'], 'backward')
    df2.sort_values('dist', inplace=True)
    return df2

In [200]:
# Just change these Series values if you want to play around
my_atts = pd.Series([0, 38, 7, 8, 6, 4, 5], index=['gender', 'age', 'attr', 'sinc', 'intel', 'fun', 'amb'])
# my_iid = 10

nearest_neighbours(att_series=my_atts).head()

Unnamed: 0,iid,age,attr_o,sinc_o,intel_o,fun_o,amb_o,gender,dist
349,351,35.0,6.882353,7.411765,7.058824,5.529412,6.117647,0,5.087466
237,239,34.0,5.777778,7.555556,7.222222,5.333333,5.5,0,5.22465
350,352,29.0,5.888889,7.944444,7.0,5.277778,5.5,0,5.874205
263,265,32.0,5.0,6.47619,6.95,3.85,5.45,0,7.803038
194,196,25.0,6.35,6.95,6.65,4.75,5.25,0,7.850075


In [201]:
# Now if you want to find more information about the person of a particular iid, just search in the original df:
# df[df['iid'] == *enter iid*]