In [None]:
# Import libraries
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
plt.style.use('ggplot')

import ast
import contractions
import emoji
import re
import string 
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm 
from nltk.tokenize import word_tokenize



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Reload the Data
df = pd.read_csv('dataset/cleaned_DatingAppReviewsDataset.csv')

df.head()
df.count()
df.info()

Unnamed: 0,Id,Name,Review,Date&Time,App,Sentiment
0,111028,Jasper Ancajas,I have a hard time logging in so i uninstalled...,2020-05-28 19:28:00,Tinder,negative
1,211245,Tiger 181,im getting no match and like...in stead of spe...,2019-03-03 21:08:00,Tinder,negative
2,9624,Mr 21,No good,2021-02-03 21:38:00,Hinge,negative
3,198692,Tim Stone,"got banned for no reason, paid customer for ye...",2019-04-20 13:56:00,Tinder,negative
4,27599,Thelma Barbara,The more I use this app the more I am appalled...,2021-08-31 17:19:00,Tinder,negative


Id           50000
Name         50000
Review       50000
Date&Time    50000
App          50000
Sentiment    50000
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         50000 non-null  int64 
 1   Name       50000 non-null  object
 2   Review     50000 non-null  object
 3   Date&Time  50000 non-null  object
 4   App        50000 non-null  object
 5   Sentiment  50000 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.3+ MB


## Data Preprocessing

Preprocessing is a critical first step in any natural language processing (NLP) task. Raw text data is often noisy, containing inconsistent capitalization, unnecessary punctuation, stopwords, emojis, and other irrelevant elements. Without cleaning and standardizing the text, machine learning models struggle to identify meaningful patterns. Effective preprocessing helps by normalizing text, reducing noise, and ensuring that the model focuses only on the content that matters for the specific task — in this case, sentiment classification. https://medium.com/@maleeshadesilva21/preprocessing-steps-for-natural-language-processing-nlp-a-beginners-guide-d6d9bf7689c9 

For this project, the preprocessing involves several key steps:

In [3]:
# Count how many reviews contain URLs
contains_url = df['Review'].str.contains(r'http\S+', regex=True, na=False)
print(f"URLs: {contains_url.sum()}")

# Count how many reviews contain hashtags
contains_hashtag = df['Review'].str.contains(r'#\w+', regex=True, na=False)
print(f"Hashtags: {contains_hashtag.sum()}")

# Check if a review contains any non-ASCII characters[True]
df['Non_ASCII_Flag'] = df['Review'].apply(lambda x: bool(re.search(r'[^\x00-\x7F]', str(x))))
print(df['Non_ASCII_Flag'].value_counts())

URLs: 0
Hashtags: 68
Non_ASCII_Flag
False    49585
True       415
Name: count, dtype: int64


In [4]:
# Remove Unwanted Elements, like URLs, hashtags, non-ASCII characters
df['Review'] = df['Review'].apply(lambda x: re.sub(r'http\S+|@\w+|#\w+|[^\x00-\x7F]+', '', x))
df.head()

Unnamed: 0,Id,Name,Review,Date&Time,App,Sentiment,Non_ASCII_Flag
0,111028,Jasper Ancajas,I have a hard time logging in so i uninstalled...,2020-05-28 19:28:00,Tinder,negative,False
1,211245,Tiger 181,im getting no match and like...in stead of spe...,2019-03-03 21:08:00,Tinder,negative,False
2,9624,Mr 21,No good,2021-02-03 21:38:00,Hinge,negative,False
3,198692,Tim Stone,"got banned for no reason, paid customer for ye...",2019-04-20 13:56:00,Tinder,negative,False
4,27599,Thelma Barbara,The more I use this app the more I am appalled...,2021-08-31 17:19:00,Tinder,negative,False


Because the reviews are informal, many of them contain contractions like "can't", "it's", or "didn't". Expanding these into their full forms ("cannot", "it is", "did not") helps the model interpret them more accurately. This is especially important for sentiment analysis, since contractions often include negations (like "not") that directly influence sentiment.

In [5]:
# Expand contractions like like can't, it's
df['Review'] = df['Review'].apply(contractions.fix)

Lowercasing ensures the model treats words like "App" and "app" as the same token, removing unnecessary case sensitivity. Tokenization splits each review into individual words (tokens), making further cleaning and feature extraction easier.

In [6]:
# Tokenization and Lowercasing
df['Review'] = df['Review'].astype(str).str.lower()
df['Tokens'] = df['Review'].apply(word_tokenize)
df.head()

Unnamed: 0,Id,Name,Review,Date&Time,App,Sentiment,Non_ASCII_Flag,Tokens
0,111028,Jasper Ancajas,i have a hard time logging in so i uninstalled...,2020-05-28 19:28:00,Tinder,negative,False,"[i, have, a, hard, time, logging, in, so, i, u..."
1,211245,Tiger 181,i am getting no match and like...in stead of s...,2019-03-03 21:08:00,Tinder,negative,False,"[i, am, getting, no, match, and, like, ..., in..."
2,9624,Mr 21,no good,2021-02-03 21:38:00,Hinge,negative,False,"[no, good]"
3,198692,Tim Stone,"got banned for no reason, paid customer for ye...",2019-04-20 13:56:00,Tinder,negative,False,"[got, banned, for, no, reason, ,, paid, custom..."
4,27599,Thelma Barbara,the more i use this app the more i am appalled...,2021-08-31 17:19:00,Tinder,negative,False,"[the, more, i, use, this, app, the, more, i, a..."


Stopwords (like "the", "is", "on") are removed because they carry little meaning. A custom stopword list is used to keep negation words ("not", "no", "nor"), which are helpful to identify negative sentiment. Punctuation and digits are removed to focus purely on textual content. Stripping ensures that no tokens are just empty spaces, which can cause issues during vectorization.

In [7]:
# Removing Stop Words, Punctuation and Digits 
stop_words = set(stopwords.words('english'))

# Noticed how some necessary stopwords like 'not' is being removed, so I decided to customize own stopwords 
custom_stopwords = set(stop_words) - {"not", "no", "nor"}
df['Tokens'] = df['Tokens'].apply(lambda tokens: [word for word in tokens if word not in custom_stopwords and word not in string.punctuation and not any (char.isdigit() for char in word)])

# Remove Whitespace
df['Tokens'] = df['Tokens'].apply(lambda tokens: [token.strip() for token in tokens if token.strip()])
df.head()

Unnamed: 0,Id,Name,Review,Date&Time,App,Sentiment,Non_ASCII_Flag,Tokens
0,111028,Jasper Ancajas,i have a hard time logging in so i uninstalled...,2020-05-28 19:28:00,Tinder,negative,False,"[hard, time, logging, uninstalled, app, not, i..."
1,211245,Tiger 181,i am getting no match and like...in stead of s...,2019-03-03 21:08:00,Tinder,negative,False,"[getting, no, match, like, ..., stead, spendin..."
2,9624,Mr 21,no good,2021-02-03 21:38:00,Hinge,negative,False,"[no, good]"
3,198692,Tim Stone,"got banned for no reason, paid customer for ye...",2019-04-20 13:56:00,Tinder,negative,False,"[got, banned, no, reason, paid, customer, year..."
4,27599,Thelma Barbara,the more i use this app the more i am appalled...,2021-08-31 17:19:00,Tinder,negative,False,"[use, app, appalled, incredible, number, scamm..."


To ensure proper normalization of tokens, lemmatization was applied using WordNetLemmatizer, with part-of-speech (POS) tagging to correctly identify verbs, nouns, adjectives, and adverbs. This improves the accuracy of lemmatization, avoiding cases where verbs like 'experiencing' remain unchanged due to being misclassified as nouns.

In [8]:
# Lemmatization with POS tagging
tag_dict = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV
}

# Return Wordnet POS that WordNetLemmatizer understands
def pos_tag_wordnet(text):
    tag = nltk.pos_tag([text])[0][1][0].upper()  
    return tag_dict.get(tag, wordnet.NOUN)  

lemmatizer = WordNetLemmatizer()

df['Tokens'] = df['Tokens'].apply(
    lambda tokens: [lemmatizer.lemmatize(token, pos_tag_wordnet(token)) for token in tokens]
)

df.head()

# # Lemmatization 
# lemmatizer = WordNetLemmatizer()
# df['Tokens'] = df['Tokens'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])
# df.head(10)


Unnamed: 0,Id,Name,Review,Date&Time,App,Sentiment,Non_ASCII_Flag,Tokens
0,111028,Jasper Ancajas,i have a hard time logging in so i uninstalled...,2020-05-28 19:28:00,Tinder,negative,False,"[hard, time, log, uninstalled, app, not, insta..."
1,211245,Tiger 181,i am getting no match and like...in stead of s...,2019-03-03 21:08:00,Tinder,negative,False,"[get, no, match, like, ..., stead, spending, n..."
2,9624,Mr 21,no good,2021-02-03 21:38:00,Hinge,negative,False,"[no, good]"
3,198692,Tim Stone,"got banned for no reason, paid customer for ye...",2019-04-20 13:56:00,Tinder,negative,False,"[get, ban, no, reason, paid, customer, year, ...."
4,27599,Thelma Barbara,the more i use this app the more i am appalled...,2021-08-31 17:19:00,Tinder,negative,False,"[use, app, appalled, incredible, number, scammer]"


In [9]:
# Drop 'Non_ASCII_Flag' column
df = df.drop('Non_ASCII_Flag', axis = 1)
df.head()

Unnamed: 0,Id,Name,Review,Date&Time,App,Sentiment,Tokens
0,111028,Jasper Ancajas,i have a hard time logging in so i uninstalled...,2020-05-28 19:28:00,Tinder,negative,"[hard, time, log, uninstalled, app, not, insta..."
1,211245,Tiger 181,i am getting no match and like...in stead of s...,2019-03-03 21:08:00,Tinder,negative,"[get, no, match, like, ..., stead, spending, n..."
2,9624,Mr 21,no good,2021-02-03 21:38:00,Hinge,negative,"[no, good]"
3,198692,Tim Stone,"got banned for no reason, paid customer for ye...",2019-04-20 13:56:00,Tinder,negative,"[get, ban, no, reason, paid, customer, year, ...."
4,27599,Thelma Barbara,the more i use this app the more i am appalled...,2021-08-31 17:19:00,Tinder,negative,"[use, app, appalled, incredible, number, scammer]"


In [10]:
#Run the polarity score on the entire dataset for EDA
sia = SentimentIntensityAnalyzer() 
res = {}

for i, row in tqdm(df.iterrows(), total = len(df)):
    text = row['Review']
    myid = row['Id'] 
    res[myid] = sia.polarity_scores(text)

  0%|          | 0/50000 [00:00<?, ?it/s]

In [11]:
vaders = pd.DataFrame(res).T

vaders = vaders.reset_index().rename(columns={'index':'Id'})

vaders = vaders.merge(df, how = 'left')

vaders.head()

Unnamed: 0,Id,neg,neu,pos,compound,Name,Review,Date&Time,App,Sentiment,Tokens
0,111028,0.074,0.926,0.0,-0.0516,Jasper Ancajas,i have a hard time logging in so i uninstalled...,2020-05-28 19:28:00,Tinder,negative,"[hard, time, log, uninstalled, app, not, insta..."
1,211245,0.167,0.833,0.0,-0.296,Tiger 181,i am getting no match and like...in stead of s...,2019-03-03 21:08:00,Tinder,negative,"[get, no, match, like, ..., stead, spending, n..."
2,9624,0.431,0.0,0.569,0.1779,Mr 21,no good,2021-02-03 21:38:00,Hinge,negative,"[no, good]"
3,198692,0.338,0.519,0.143,-0.4588,Tim Stone,"got banned for no reason, paid customer for ye...",2019-04-20 13:56:00,Tinder,negative,"[get, ban, no, reason, paid, customer, year, ...."
4,27599,0.18,0.755,0.065,-0.5057,Thelma Barbara,the more i use this app the more i am appalled...,2021-08-31 17:19:00,Tinder,negative,"[use, app, appalled, incredible, number, scammer]"


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         50000 non-null  int64 
 1   Name       50000 non-null  object
 2   Review     50000 non-null  object
 3   Date&Time  50000 non-null  object
 4   App        50000 non-null  object
 5   Sentiment  50000 non-null  object
 6   Tokens     50000 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.7+ MB


In [None]:
# SAVE THE Preprocessed DataFrame to a new CSV file
vaders.to_csv('dataset/preprocessed_DatingAppReviewsDataset.csv', index=False)