# Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Data Extraction

We start from the Lovoo dataset page on Kaggle, published in 2020, which provides two datasets. Since the users dataset turns out to be a subset of the API dataset, we use only the API dataset.

In [2]:
# Load the raw Lovoo API export and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='lovoo_v3_users_api-results.csv')
data.head(5)

Unnamed: 0,gender,genderLooking,age,name,counts_details,counts_pictures,counts_profileVisits,counts_kisses,counts_fans,counts_g,...,shareProfileEnabled,lastOnlineDate,lastOnlineTime,birthd,crypt,freetext,whazzup,userId,pictureId,isSystemProfile
0,F,M,25,daeni,1.0,4,8279,239,0,3,...,1,2015-04-25T20:43:26Z,1429995000.0,0,,Nur tote Fisch schwimmen mit dem Strom,Nur tote fische schwimmen mit dem strom,4e3842f79b70e7ea57000064,55303fc3160ba0eb728b4575,
1,F,M,22,italiana 92,0.85,5,663,13,0,0,...,1,2015-04-26T09:19:35Z,1430040000.0,0,,,Primaveraaa<3,4e3d34bf5d2bce7b160006a3,552e7b61c66da10d1e8b4c82,
2,F,M,21,Lauraaa,0.0,4,1369,88,0,2,...,1,2015-04-06T14:24:07Z,1428330000.0,0,,,,4ec96c090dc82cb25d00000a,54a584ecc56da128638b4674,
3,F,none,20,Qqkwmdowlo,0.12,3,22187,1015,2,3,...,1,2015-04-07T11:21:01Z,1428406000.0,0,,,Je pense donc je suis. Instagram quedev,4eef8b81ebf2c8f64000000c,54c92738076ea1b5338b4735,
4,F,M,21,schaessie {3,0.15,12,35262,1413,9,12,...,1,2015-04-06T14:25:20Z,1428330000.0,0,,,Instagram: JESSSIESCH,4ef3cc5aa9d0b3d07d000017,54e1a6f6c76da135748b4a3a,


In [3]:
# Print each column's non-null count and dtype to spot missing values early.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3992 entries, 0 to 3991
Data columns (total 42 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  3992 non-null   object 
 1   genderLooking           3992 non-null   object 
 2   age                     3992 non-null   int64  
 3   name                    3992 non-null   object 
 4   counts_details          3992 non-null   float64
 5   counts_pictures         3992 non-null   int64  
 6   counts_profileVisits    3992 non-null   int64  
 7   counts_kisses           3992 non-null   int64  
 8   counts_fans             3992 non-null   int64  
 9   counts_g                3992 non-null   int64  
 10  flirtInterests_chat     3992 non-null   bool   
 11  flirtInterests_friends  3992 non-null   bool   
 12  flirtInterests_date     3992 non-null   bool   
 13  country                 3992 non-null   object 
 14  city                    3706 non-null   

# Data Cleaning

After looking at the columns and data points, we see that some information is captured more precisely by another variable; for example, distance is a better measure of location than the city or location name. Other columns, such as a profile's bio and catchphrase, are hard to analyze without NLP. We drop all of these columns.

In [4]:
# List every column name so the ones to drop can be picked out.
data.columns

Index(['gender', 'genderLooking', 'age', 'name', 'counts_details',
       'counts_pictures', 'counts_profileVisits', 'counts_kisses',
       'counts_fans', 'counts_g', 'flirtInterests_chat',
       'flirtInterests_friends', 'flirtInterests_date', 'country', 'city',
       'location', 'distance', 'isFlirtstar', 'isHighlighted', 'isInfluencer',
       'isMobile', 'isNew', 'isOnline', 'isVip', 'lang_count', 'lang_fr',
       'lang_en', 'lang_de', 'lang_it', 'lang_es', 'lang_pt', 'verified',
       'shareProfileEnabled', 'lastOnlineDate', 'lastOnlineTime', 'birthd',
       'crypt', 'freetext', 'whazzup', 'userId', 'pictureId',
       'isSystemProfile'],
      dtype='object')

In [9]:
# Columns removed from the cleaned frame. `distance` is kept as the location
# measure, and the text fields (freetext, whazzup) are dropped.
drop = [
    'city', 'location', 'pictureId', 'isMobile', 'isSystemProfile',
    'lang_fr', 'lang_en', 'lang_de', 'lang_it', 'lang_es', 'lang_pt',
    'birthd', 'counts_details', 'crypt', 'freetext', 'whazzup',
]
# drop() returns a new frame, so the raw `data` stays untouched.
data2 = data.drop(drop, axis='columns')
data2

Unnamed: 0,gender,genderLooking,age,name,counts_pictures,counts_profileVisits,counts_kisses,counts_fans,counts_g,flirtInterests_chat,...,isInfluencer,isNew,isOnline,isVip,lang_count,verified,shareProfileEnabled,lastOnlineDate,lastOnlineTime,userId
0,F,M,25,daeni,4,8279,239,0,3,True,...,0,0,0,0,1,0,1,2015-04-25T20:43:26Z,1.429995e+09,4e3842f79b70e7ea57000064
1,F,M,22,italiana 92,5,663,13,0,0,True,...,0,0,0,0,3,0,1,2015-04-26T09:19:35Z,1.430040e+09,4e3d34bf5d2bce7b160006a3
2,F,M,21,Lauraaa,4,1369,88,0,2,False,...,0,0,1,0,0,0,1,2015-04-06T14:24:07Z,1.428330e+09,4ec96c090dc82cb25d00000a
3,F,none,20,Qqkwmdowlo,3,22187,1015,2,3,True,...,0,0,0,0,2,0,1,2015-04-07T11:21:01Z,1.428406e+09,4eef8b81ebf2c8f64000000c
4,F,M,21,schaessie {3,12,35262,1413,9,12,True,...,0,0,1,0,1,0,1,2015-04-06T14:25:20Z,1.428330e+09,4ef3cc5aa9d0b3d07d000017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3987,F,M,22,Ser**a,6,1003,98,0,0,False,...,0,0,0,0,1,0,1,2015-04-05T07:13:49Z,1.428218e+09,CRYu9vQnJ8lHU2pVpimKJGIyTHqR73rAu8hg21rtn340Ys...
3988,F,M,24,Mon**a,7,6890,563,0,0,False,...,0,0,1,0,1,1,1,2015-04-19T11:00:59Z,1.429441e+09,CRYuikFTcdFDsGDJugu3PyvSonM1LYwyviJXvZBUayBMgI...
3989,F,M,20,Fa**y,4,1157,52,1,0,False,...,0,0,0,0,1,0,1,2015-04-19T08:37:52Z,1.429433e+09,CRYv9Qg5MTSHaqrTK2CMe9cPRNACAFdYDi7BEgoQTyljzz...
3990,F,M,20,Fa**y,4,1157,52,1,0,False,...,0,0,0,0,1,0,1,2015-04-19T08:37:52Z,1.429433e+09,CRYwD9ZmbjDXJu8li2DPXI0UseBJt2lovQBWnbKcGDVVcs...


In [10]:
# Count missing values per column (sum down the rows).
data2.isna().sum(axis=0)

gender                     0
genderLooking              0
age                        0
name                       0
counts_pictures            0
counts_profileVisits       0
counts_kisses              0
counts_fans                0
counts_g                   0
flirtInterests_chat        0
flirtInterests_friends     0
flirtInterests_date        0
country                    0
distance                  46
isFlirtstar                0
isHighlighted              0
isInfluencer               0
isNew                      0
isOnline                   0
isVip                      0
lang_count                 0
verified                   0
shareProfileEnabled        0
lastOnlineDate             1
lastOnlineTime             1
userId                     0
dtype: int64

Since some values are missing from distance, we fill them in with the median of the column. The single missing value in lastOnlineDate and lastOnlineTime is left as is.

In [17]:
# Impute missing distances with the median of the observed distances.
distance_median = data2['distance'].median()
data2['distance'] = data2['distance'].fillna(distance_median)

## Now our dataset is ready to be used.

In [18]:
# Save the cleaned frame. index=True (the pandas default) is kept, so the row
# index is written as an unnamed first column of the CSV.
data2.to_csv('cleanedLovooData.csv', index=True)