In [38]:
import pandas as pd
import numpy as np
import json

from sklearn.neighbors import KNeighborsClassifier
from opencage.geocoder import OpenCageGeocode

from emoji import UNICODE_EMOJI

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [39]:
csv_filename = 'data/tinder_profile_data.csv'

data = pd.read_csv(csv_filename)

data = data.drop(columns = ['anthem','profile_pic_urls']) #Dropped these as no data was gathered 

empty_entries = data.name.isna()
data = data[-empty_entries]
duplicate_entries = data.duplicated()
data = data[-duplicate_entries]

In [40]:
data.name = data.name.apply(lambda x: x.capitalize())
data.age = data.age.apply(lambda x: int(x) if not np.isnan(x) else x)
data.city = data.city.apply(lambda x: x[9:] if type(x) != float else x)
data.distance = data.distance.apply(lambda x: int(x.split(' ')[0]) if type(x) != float else x)
data.head(20)

Unnamed: 0,name,age,college,job,city,gender,distance,details
0,Miranda,19.0,Concordia University,,,,3722.0,From BC and I need to learn French ASAP\nInsta...
2,Katherine,19.0,John Abbott College,,,,3706.0,Single and ready to mingle\n •reina del perreo...
3,Katie,19.0,,,,,3584.0,just a cupcake looking for a stud muffin \n18
4,Megan,19.0,McMaster University,,,Woman,3613.0,
5,Abby,18.0,,,,,3616.0,Insta:abby.linton
6,Eden,18.0,Vanier College,,Montréal,,3721.0,Lisa is my constant vibe
7,Angelika,18.0,,,Pembroke,,3455.0,
8,Chloé,20.0,,,Montréal,,3677.0,On diras qu’on s’est rencontré a l’épicerie? 😏...
9,Laurie,18.0,,,,,3719.0,
10,Alexia,21.0,,,,,3703.0,🇨🇦 🇮🇹 Insta: alexia_lebrun\nSc: loulou_loveyou


In [41]:
total = len(data)

def summary(data):
    print(10*"-" + "STATS" + 10*"-")
    print("{} TOTAL ENTRIES GATHERED".format(total))
    list = [data.age, data.college, data.job, data.city, data.gender, data.distance, data.details]
    names = ['AGE', 'COLLEGE', 'JOB', 'CITY', 'GENDER', 'DISTANCE', 'DETAILS']
    n = 0
    for each in list:
        nums = total - each.isna().sum()
        print(str(nums) + " ENTRIES HAVE " + names[n] + " DATA (" + str(round(100*nums/total,1)) + "%)")
        n += 1

    print(25*"-")
    avg_age = data.age.mean()
    print("AVERAGE AGE IN DATA SET: {}".format(int(avg_age)))
    unique_college = len(data.college.unique())
    print("NUMBER OF UNIQUE COLLEGES: {}".format(unique_college))
    unique_jobs = len(data.job.unique())
    print("NUMBER OF UNIQUE JOBS: {}".format(unique_jobs))
    unique_cities = len(data.city.unique())
    print("NUMBER OF UNIQUE CITIES: {}".format(unique_cities))

summary(data)

----------STATS----------
2497 TOTAL ENTRIES GATHERED
2493 ENTRIES HAVE AGE DATA (99.8%)
1123 ENTRIES HAVE COLLEGE DATA (45.0%)
311 ENTRIES HAVE JOB DATA (12.5%)
623 ENTRIES HAVE CITY DATA (24.9%)
391 ENTRIES HAVE GENDER DATA (15.7%)
2487 ENTRIES HAVE DISTANCE DATA (99.6%)
1733 ENTRIES HAVE DETAILS DATA (69.4%)
-------------------------
AVERAGE AGE IN DATA SET: 19
NUMBER OF UNIQUE COLLEGES: 337
NUMBER OF UNIQUE JOBS: 265
NUMBER OF UNIQUE CITIES: 163


In [42]:
data.city.isna().sum()

1874

In [43]:
unique_cities = pd.unique(data.city)
for each in unique_cities:
    print(each)

nan
Montréal
Pembroke
Ottawa
Gatineau
Ogdensburg
Kingston
Vaudreuil-Dorion
Saint-Jérôme
Laval
Grand-Mère
Rivière-du-Loup
Drummondville
Shawinigan
Québec
Quebec City
Rimouski
Grande-Vallée
Lévis
Notre-Dame-du-Mont-Carmel
Saint-Romuald
Boischatel
Trois-Rivières
Victoriaville
Winnipeg
Thompson
Emerson
Steinbach
Regina
Kipling
Weyburn
Oakville
Minneapolis
Niagara Falls
Hamilton
Toronto
Mississauga
Erin
St. Catharines
Aurora
Vaughan
Welland
Georgetown
Oshawa
Guelph
Arctic Bay
Whitby
Kelowna
Whitchurch-Stouffville
Halton Hills
Burlington
Santa Barbara
Richmond Hill
Cambridge
Scugog
ברי
Keswick
Calgary
New York
Barrie
Ancaster
Malibu
St Catharines
Niagara-on-the-Lake
Pickering
Buffalo
Newmarket
Windsor
London
Brampton
Madrid
Orangeville
Bradford
Bowmanville
Saint-Simon
Saint-Jean-sur-Richelieu
Mascouche
Sainte-Catherine
Sainte-Julie
Saint-Hyacinthe
L'Assomption
Carignan
Saint-Dominique
Terrebonne
Rawdon
Blainville
Saint-Basile-le-Grand
Boucherville
Mercier
Salaberry-de-Valleyfield
Farnham
Poi

In [44]:
type(data.city)

pandas.core.series.Series

In [45]:
data.city.isna().sum()

1874

In [46]:
def fill_city(series,city_list,KNN_model):
    if type(series.city)==float and ~np.isnan(series.distance):
        try:
            city = np.random.choice(city_list[series.distance],1, replace=True)[0]
        except:
            if ~np.isnan(series.distance):
                city = KNN_model.predict(np.array([[series.distance]]))[0]
            else:
                city = np.nan
    else:
        city = series.city
    return city

In [47]:
def fill_missing_cities(data):
    print("FILLING MISSING CITY VALUES")
    num = data.city.isna().sum()
    print("FOUND {} MISSING VALUES".format(num))
    filtered = data[-data.city.isna()].copy()
    filtered = filtered[-filtered.distance.isna()]
    city_list = {}
    for each in filtered.distance.unique():
        city_list[each] = filtered[filtered.distance == each].city

    #KNN for remaining values:
    X = np.array(filtered.distance).reshape(-1,1)
    y = np.array(filtered.city)
    KNN_City = KNeighborsClassifier(n_neighbors=10).fit(X,y)
    
    data.city = data.apply(lambda x: fill_city(x,city_list,KNN_City), axis =1)
    new_num = data.city.isna().sum()
    print("{} MISSING CITY VALUES REMAIN ({}%)".format(new_num,round(new_num/len(data),2)))
    
    return data.city

In [48]:
cit = fill_missing_cities(data)


FILLING MISSING CITY VALUES
FOUND 1874 MISSING VALUES
7 MISSING CITY VALUES REMAIN (0.0%)


In [62]:
cit[15]

'Joliette'