# Import Necessary Libraries

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Extraction

We first take a look at the Lovoo page on Kaggle which was published on 2020, which has two datasets. However, it is observed that the users dataset is a subset of the api dataset, so we will just use the api dataset.

In [18]:
data = pd.read_csv('lovoo_v3_users_api-results.csv')
data.head()

Unnamed: 0,gender,genderLooking,age,name,counts_details,counts_pictures,counts_profileVisits,counts_kisses,counts_fans,counts_g,...,shareProfileEnabled,lastOnlineDate,lastOnlineTime,birthd,crypt,freetext,whazzup,userId,pictureId,isSystemProfile
0,F,M,25,daeni,1.0,4,8279,239,0,3,...,1,2015-04-25T20:43:26Z,1429995000.0,0,,Nur tote Fisch schwimmen mit dem Strom,Nur tote fische schwimmen mit dem strom,4e3842f79b70e7ea57000064,55303fc3160ba0eb728b4575,
1,F,M,22,italiana 92,0.85,5,663,13,0,0,...,1,2015-04-26T09:19:35Z,1430040000.0,0,,,Primaveraaa<3,4e3d34bf5d2bce7b160006a3,552e7b61c66da10d1e8b4c82,
2,F,M,21,Lauraaa,0.0,4,1369,88,0,2,...,1,2015-04-06T14:24:07Z,1428330000.0,0,,,,4ec96c090dc82cb25d00000a,54a584ecc56da128638b4674,
3,F,none,20,Qqkwmdowlo,0.12,3,22187,1015,2,3,...,1,2015-04-07T11:21:01Z,1428406000.0,0,,,Je pense donc je suis. Instagram quedev,4eef8b81ebf2c8f64000000c,54c92738076ea1b5338b4735,
4,F,M,21,schaessie {3,0.15,12,35262,1413,9,12,...,1,2015-04-06T14:25:20Z,1428330000.0,0,,,Instagram: JESSSIESCH,4ef3cc5aa9d0b3d07d000017,54e1a6f6c76da135748b4a3a,


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3992 entries, 0 to 3991
Data columns (total 42 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  3992 non-null   object 
 1   genderLooking           3992 non-null   object 
 2   age                     3992 non-null   int64  
 3   name                    3992 non-null   object 
 4   counts_details          3992 non-null   float64
 5   counts_pictures         3992 non-null   int64  
 6   counts_profileVisits    3992 non-null   int64  
 7   counts_kisses           3992 non-null   int64  
 8   counts_fans             3992 non-null   int64  
 9   counts_g                3992 non-null   int64  
 10  flirtInterests_chat     3992 non-null   bool   
 11  flirtInterests_friends  3992 non-null   bool   
 12  flirtInterests_date     3992 non-null   bool   
 13  country                 3992 non-null   object 
 14  city                    3706 non-null   

# Data Cleaning

After taking a look at the columns and data points, there are some values which are pretty much the same, such as city and location. There are also some values which are pretty hard to analyze without NLP, such as a profile's bio and catchphrase, so we'll be dropping all of these.

In [20]:
data.isna().sum()

gender                       0
genderLooking                0
age                          0
name                         0
counts_details               0
counts_pictures              0
counts_profileVisits         0
counts_kisses                0
counts_fans                  0
counts_g                     0
flirtInterests_chat          0
flirtInterests_friends       0
flirtInterests_date          0
country                      0
city                       286
location                    13
distance                    46
isFlirtstar                  0
isHighlighted                0
isInfluencer                 0
isMobile                     0
isNew                        0
isOnline                     0
isVip                        0
lang_count                   0
lang_fr                      0
lang_en                      0
lang_de                      0
lang_it                      0
lang_es                      0
lang_pt                      0
verified                     0
sharePro

In [21]:
# drop all columns with nan value

df = df.dropna(axis = 1)
df

Unnamed: 0,gender,genderLooking,age,name,counts_details,counts_pictures,counts_profileVisits,counts_kisses,counts_fans,counts_g,...,isVIP,isVerified,countDetails,flirtstar,freshman,hasBirthday,highlighted,locked,mobile,online
0,F,M,25,daeni,1.00,4,8279,239,0,3,...,False,False,1.00,False,False,False,False,False,True,False
1,F,M,22,italiana 92,0.85,5,663,13,0,0,...,False,False,0.85,False,False,False,False,False,True,False
2,F,none,20,Qqkwmdowlo,0.12,3,22187,1015,2,3,...,False,False,0.12,False,False,False,False,False,False,False
3,F,M,24,Baby dee,0.81,18,7339,180,0,2,...,False,False,0.81,False,False,False,False,False,True,True
4,F,none,24,Anna,0.65,13,18672,492,0,1,...,False,False,0.65,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,F,M,22,Ser**a,0.85,6,1003,98,0,0,...,False,False,0.85,False,False,False,False,False,True,False
2920,F,M,24,Mon**a,1.00,7,6890,563,0,0,...,False,True,1.00,False,False,False,False,False,True,True
2921,F,M,20,Fa**y,0.90,4,1157,52,1,0,...,False,False,0.90,False,False,False,False,False,True,False
2922,F,M,20,Fa**y,0.90,4,1157,52,1,0,...,False,False,0.90,False,False,False,False,False,True,False


## Now our dataset is ready to be used.