# Dating App Reviews: Exploring Dataset and Data Wrangling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw reviews CSV into a DataFrame and preview its first five rows.
dating_apps = pd.read_csv(filepath_or_buffer="DatingAppReviewsDataset.csv")
dating_apps.head(n=5)

Unnamed: 0.1,Unnamed: 0,Name,Review,Rating,#ThumbsUp,Date&Time,App
0,0,linah sibanda,On this app i cant find a partner,5,0,18-02-2022 01:19,Tinder
1,1,Norman Johnson,Tinder would be so much better if we could spe...,3,0,18-02-2022 01:16,Tinder
2,2,David Hume,Still doesn't correctly notify matches or mess...,1,0,18-02-2022 01:11,Tinder
3,3,Last 1 Standing,"Got banned because I updated my bio to say ""I ...",2,0,18-02-2022 01:11,Tinder
4,4,Arthur Magamedov,Love it!,5,0,18-02-2022 01:06,Tinder


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
dating_apps.shape

(681994, 7)

In [4]:
# Print each column's dtype and non-null count to spot mistyped or incomplete columns.
dating_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681994 entries, 0 to 681993
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  681994 non-null  int64 
 1   Name        681989 non-null  object
 2   Review      680642 non-null  object
 3   Rating      681994 non-null  int64 
 4   #ThumbsUp   681994 non-null  int64 
 5   Date&Time   681994 non-null  object
 6   App         681994 non-null  object
dtypes: int64(3), object(4)
memory usage: 36.4+ MB


In [5]:
# 'Date&Time' is loaded as plain strings (object dtype), so convert it to datetime64.
# The raw values are day-first, e.g. "18-02-2022 01:19". The format is given
# explicitly because, without it, an ambiguous date such as "05-02-2022" can be
# read month-first (as 2 May instead of 5 February) while unambiguous ones are
# read day-first, silently mixing both conventions in one column.
dating_apps['Date&Time'] = pd.to_datetime(dating_apps['Date&Time'], format='%d-%m-%Y %H:%M')
dating_apps.dtypes

Unnamed: 0             int64
Name                  object
Review                object
Rating                 int64
#ThumbsUp              int64
Date&Time     datetime64[ns]
App                   object
dtype: object

In [6]:
# Build a boolean mask of the frame (True where a cell is null), then tally how
# often each row-wise pattern of missingness occurs.
# Only the Name and Review columns turn out to contain nulls.
missing_data = dating_apps.isna()
missing_data.value_counts()

Unnamed: 0  Name   Review  Rating  #ThumbsUp  Date&Time  App  
False       False  False   False   False      False      False    680637
                   True    False   False      False      False      1352
            True   False   False   False      False      False         5
dtype: int64

In [7]:
# Name and Review are free-text columns that do not affect the findings, and no
# numerical value is missing, so the missing entries are left as they are for now.
# Summing the boolean mask counts the True (missing) cells directly. Indexing
# value_counts() with [1] is a positional lookup: it fails when a column has no
# missing values and reads the wrong bucket if missing cells are the majority.
print(missing_data['Name'].sum(), "missing values in Name column")
print(missing_data['Review'].sum(), "missing values in Review column")

5 missing values in Name column
1352 missing values in Review column


In [8]:
# Drop rows that repeat an earlier row in every column, then report how many
# rows remain; the count is unchanged, so nothing was removed.
# NOTE(review): the comparison includes 'Unnamed: 0', which looks like a per-row
# id -- if it is unique, no two rows can ever compare equal. TODO confirm.
dating_apps = dating_apps.drop_duplicates(keep='first')
len(dating_apps)

681994

In [9]:
# Data-quality check on Rating: tally how often each rating value occurs.
# One review carries the value 0.
dating_apps.loc[:, 'Rating'].value_counts()

1    251969
5    233464
4     87347
3     56958
2     52255
0         1
Name: Rating, dtype: int64

In [10]:
# Show every row whose Rating is not one of the values 1-5.
dating_apps[~dating_apps['Rating'].isin([1, 2, 3, 4, 5])]

Unnamed: 0.1,Unnamed: 0,Name,Review,Rating,#ThumbsUp,Date&Time,App
272613,272613,Rahul soren,Love it but ...my match is not replying its a ...,0,0,2018-05-27 23:45:00,Tinder


In [11]:
# Print the full Review text of the first row whose Rating is not one of 1-5.
print(dating_apps.loc[~dating_apps['Rating'].isin([1, 2, 3, 4, 5]), 'Review'].iloc[0])

Love it but ...my match is not replying its a prblm on app or smthing else 
please tell me


In [12]:
# Impute the one invalid rating (0) with the most frequent rating among Tinder
# reviews, since the affected row is a Tinder review.
tinder_ratings = dating_apps.loc[dating_apps['App'] == 'Tinder', 'Rating']
most_common_tinder_rating = tinder_ratings.value_counts().idxmax()
# Assign the result back rather than calling replace(..., inplace=True) on the
# selected column: that chained in-place call acts on an intermediate Series and
# leaves the DataFrame unchanged once pandas Copy-on-Write is active.
dating_apps['Rating'] = dating_apps['Rating'].replace(0, most_common_tinder_rating)
dating_apps['Rating'].value_counts()

1    251970
5    233464
4     87347
3     56958
2     52255
Name: Rating, dtype: int64

In [13]:
# Write the cleaned frame to CSV, leaving out the DataFrame index.
dating_apps.to_csv(path_or_buf='Dating_Apps.csv', index=False)