In [2]:
# NOTE(review): looks like a leftover scratch/smoke-test cell — its execution
# count is higher than that of the cell that follows. Confirm it can be removed.
print('hello')

hello


In [1]:
import pandas as pd

# Load the review dataset into `df`.
# The file is read as UTF-8 first; if it is not valid UTF-8 the read is retried
# as latin-1. A failure of the latin-1 retry is reported and leaves `df` as None.
REVIEWS_CSV = 'whatsapp_reviews_relevant.csv'

# Start from a known value so the check below never sees a stale or missing `df`.
df = None
try:
    df = pd.read_csv(REVIEWS_CSV, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv(REVIEWS_CSV, encoding='latin-1')
    except Exception as e:
        print(f"Error loading the file: {e}")

if df is not None:
    # The review text is stored in the 'review' column; every preprocessing
    # step further down reads df['review'].
    text_column = 'review'

    # Fail here, with the available columns listed, rather than with a bare
    # KeyError in a later cell.
    if text_column not in df.columns:
        raise KeyError(
            f"Expected a '{text_column}' column, found: {list(df.columns)}"
        )

    display(df.head())
    print(f"DataFrame shape: {df.shape}")
    print(f"Text column: {text_column}")

Unnamed: 0,user,review,rating
0,Willem van Heerden,A couple of annoying bugs have crept in recent...,1
1,Oscar Q,I was really hoping for a smooth messaging exp...,1
2,Mike Moritz,Updates always break functionality while addin...,1
3,astroturtle,FIX THE BUG THAT FORGETS YOUR LOGIN AND FORCES...,1
4,A Google user,"I noticed that recently after updates, when li...",2


DataFrame shape: (500, 3)
Text column: content


<h1>Preprocessing</h1>
<h2>Case folding</h2>

In [2]:
# Case folding: lower-case every review and write the result back over the
# original 'review' column, then preview the first rows.
reviews_lower = df['review'].str.lower()
df['review'] = reviews_lower
print(reviews_lower.head())

0    a couple of annoying bugs have crept in recent...
1    i was really hoping for a smooth messaging exp...
2    updates always break functionality while addin...
3    fix the bug that forgets your login and forces...
4    i noticed that recently after updates, when li...
Name: review, dtype: object


<h2>Tokenization</h2>

In [4]:
import nltk
from nltk.tokenize import word_tokenize

# Make sure the tokenizer data needed by word_tokenize is downloaded.
# Older NLTK releases load the 'punkt' resource; newer releases load
# 'punkt_tab' instead, so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenization helper applied to each review
def tokenize_text(text):
    """Split a review into word tokens.

    Args:
        text: The value to tokenize.

    Returns:
        The list produced by ``word_tokenize(text)`` when ``text`` is a
        string; an empty list for any other value.
    """
    return word_tokenize(text) if isinstance(text, str) else []

# Tokenize every review and attach the token lists as a new column
token_lists = df['review'].apply(tokenize_text)
df['review_tokenized'] = token_lists

# Show the new column using the notebook's rich display
display(df['review_tokenized'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0      [a, couple, of, annoying, bugs, have, crept, i...
1      [i, was, really, hoping, for, a, smooth, messa...
2      [updates, always, break, functionality, while,...
3      [fix, the, bug, that, forgets, your, login, an...
4      [i, noticed, that, recently, after, updates, ,...
                             ...                        
495    [whatsapp, has, a, serious, issue, where, call...
496    [v., good, ., needs, notification, sound, opti...
497    [dear, whatsapp, team, ,, i, hope, this, email...
498    [i, use, whatsapp, for, a, long, time, but, wh...
499    [whatsapp, messenger, is, a, great, app, with,...
Name: review_tokenized, Length: 500, dtype: object

<h2>Stopword removal</h2>

In [5]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK data: the stop-word lists read via stopwords.words().
# The 'punkt' data used by word_tokenize is downloaded in the tokenization
# cell above, so it is not fetched again here.
nltk.download('stopwords')

# Cache of stop-word sets keyed by language. It is filled on first use so the
# NLTK corpus is read once instead of once per review.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it at most once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


# Define function to remove stopwords
def remove_stopwords(text):
    """Remove English stop words from a piece of text.

    The text is tokenized with ``word_tokenize``; every token whose
    lower-cased form is in NLTK's English stop-word list is dropped and the
    remaining tokens are joined with single spaces. Tokens that are not in
    the list (including punctuation tokens) are kept as they are.

    Args:
        text: The value to filter.

    Returns:
        The filtered text, or an empty string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    stop_words = _get_stop_words('english')

    # Compare on the lower-cased token so capitalised stop words are removed too.
    kept_tokens = [
        word for word in word_tokenize(text) if word.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

# Run stop-word removal over every review and store it next to the original
df['no_stopword'] = df['review'].apply(remove_stopwords)

# Preview the original text beside its filtered version
preview_columns = ['review', 'no_stopword']
print(df[preview_columns].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


                                              review  \
0  a couple of annoying bugs have crept in recent...   
1  i was really hoping for a smooth messaging exp...   
2  updates always break functionality while addin...   
3  fix the bug that forgets your login and forces...   
4  i noticed that recently after updates, when li...   

                                         no_stopword  
0  couple annoying bugs crept recently , 's impos...  
1  really hoping smooth messaging experience , wh...  
2  updates always break functionality adding `` f...  
3  fix bug forgets login forces restore everythin...  
4  noticed recently updates , listening voice not...  


In [6]:
# Preview the frame, which now also carries the 'review_tokenized' and
# 'no_stopword' columns added above.
df.head()

Unnamed: 0,user,review,rating,review_tokenized,no_stopword
0,Willem van Heerden,a couple of annoying bugs have crept in recent...,1,"[a, couple, of, annoying, bugs, have, crept, i...","couple annoying bugs crept recently , 's impos..."
1,Oscar Q,i was really hoping for a smooth messaging exp...,1,"[i, was, really, hoping, for, a, smooth, messa...","really hoping smooth messaging experience , wh..."
2,Mike Moritz,updates always break functionality while addin...,1,"[updates, always, break, functionality, while,...",updates always break functionality adding `` f...
3,astroturtle,fix the bug that forgets your login and forces...,1,"[fix, the, bug, that, forgets, your, login, an...",fix bug forgets login forces restore everythin...
4,A Google user,"i noticed that recently after updates, when li...",2,"[i, noticed, that, recently, after, updates, ,...","noticed recently updates , listening voice not..."


<h2>Stemming</h2>

In [7]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Create stemmer: one shared PorterStemmer instance that stem_text (below)
# calls for every token.
stemmer = PorterStemmer()

# Stemming helper applied to each review
def stem_text(text):
    """Stem every token of a piece of text.

    The text is tokenized with ``word_tokenize``, each token is passed to the
    module-level ``stemmer`` and the stems are joined with single spaces.

    Args:
        text: The value to stem.

    Returns:
        The stemmed text, or an empty string when ``text`` is not a string.
    """
    if isinstance(text, str):
        stems = (stemmer.stem(token) for token in word_tokenize(text))
        return ' '.join(stems)
    return ""

# Stem every review and store the result in its own column
df['stemmed_review'] = df['review'].apply(stem_text)

# Preview the original text beside its stemmed version
preview_columns = ['review', 'stemmed_review']
print(df[preview_columns].head())

                                              review  \
0  a couple of annoying bugs have crept in recent...   
1  i was really hoping for a smooth messaging exp...   
2  updates always break functionality while addin...   
3  fix the bug that forgets your login and forces...   
4  i noticed that recently after updates, when li...   

                                      stemmed_review  
0  a coupl of annoy bug have crept in recent , an...  
1  i wa realli hope for a smooth messag experi , ...  
2  updat alway break function while ad `` featur ...  
3  fix the bug that forget your login and forc yo...  
4  i notic that recent after updat , when listen ...  


In [8]:
# Preview the frame, which now also carries the 'stemmed_review' column.
df.head()

Unnamed: 0,user,review,rating,review_tokenized,no_stopword,stemmed_review
0,Willem van Heerden,a couple of annoying bugs have crept in recent...,1,"[a, couple, of, annoying, bugs, have, crept, i...","couple annoying bugs crept recently , 's impos...","a coupl of annoy bug have crept in recent , an..."
1,Oscar Q,i was really hoping for a smooth messaging exp...,1,"[i, was, really, hoping, for, a, smooth, messa...","really hoping smooth messaging experience , wh...","i wa realli hope for a smooth messag experi , ..."
2,Mike Moritz,updates always break functionality while addin...,1,"[updates, always, break, functionality, while,...",updates always break functionality adding `` f...,updat alway break function while ad `` featur ...
3,astroturtle,fix the bug that forgets your login and forces...,1,"[fix, the, bug, that, forgets, your, login, an...",fix bug forgets login forces restore everythin...,fix the bug that forget your login and forc yo...
4,A Google user,"i noticed that recently after updates, when li...",2,"[i, noticed, that, recently, after, updates, ,...","noticed recently updates , listening voice not...","i notic that recent after updat , when listen ..."


<h2>Lemmatization</h2>

In [9]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data: the 'wordnet' package
# (presumably the corpus WordNetLemmatizer reads from — confirm against the NLTK docs).
nltk.download('wordnet')

# Create lemmatizer: one shared WordNetLemmatizer instance that lemmatize_text
# (below) calls for every token.
lemmatizer = WordNetLemmatizer()

# Lemmatization helper applied to each review
def lemmatize_text(text):
    """Lemmatize every token of a piece of text.

    The text is tokenized with ``word_tokenize``, each token is passed to the
    module-level ``lemmatizer`` and the results are joined with single
    spaces. ``lemmatize`` is called with the token only, so the lemmatizer's
    default part of speech applies to every token.

    Args:
        text: The value to lemmatize.

    Returns:
        The lemmatized text, or an empty string when ``text`` is not a string.
    """
    if isinstance(text, str):
        lemmas = []
        for token in word_tokenize(text):
            lemmas.append(lemmatizer.lemmatize(token))
        return ' '.join(lemmas)
    return ""

# Lemmatize every review and store the result in its own column
df['lemmatized_review'] = df['review'].apply(lemmatize_text)

# Preview the original text beside its lemmatized version
preview_columns = ['review', 'lemmatized_review']
print(df[preview_columns].head())

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                              review  \
0  a couple of annoying bugs have crept in recent...   
1  i was really hoping for a smooth messaging exp...   
2  updates always break functionality while addin...   
3  fix the bug that forgets your login and forces...   
4  i noticed that recently after updates, when li...   

                                   lemmatized_review  
0  a couple of annoying bug have crept in recentl...  
1  i wa really hoping for a smooth messaging expe...  
2  update always break functionality while adding...  
3  fix the bug that forgets your login and force ...  
4  i noticed that recently after update , when li...  


In [10]:
# Preview the frame, which now also carries the 'lemmatized_review' column.
df.head()

Unnamed: 0,user,review,rating,review_tokenized,no_stopword,stemmed_review,lemmatized_review
0,Willem van Heerden,a couple of annoying bugs have crept in recent...,1,"[a, couple, of, annoying, bugs, have, crept, i...","couple annoying bugs crept recently , 's impos...","a coupl of annoy bug have crept in recent , an...",a couple of annoying bug have crept in recentl...
1,Oscar Q,i was really hoping for a smooth messaging exp...,1,"[i, was, really, hoping, for, a, smooth, messa...","really hoping smooth messaging experience , wh...","i wa realli hope for a smooth messag experi , ...",i wa really hoping for a smooth messaging expe...
2,Mike Moritz,updates always break functionality while addin...,1,"[updates, always, break, functionality, while,...",updates always break functionality adding `` f...,updat alway break function while ad `` featur ...,update always break functionality while adding...
3,astroturtle,fix the bug that forgets your login and forces...,1,"[fix, the, bug, that, forgets, your, login, an...",fix bug forgets login forces restore everythin...,fix the bug that forget your login and forc yo...,fix the bug that forgets your login and force ...
4,A Google user,"i noticed that recently after updates, when li...",2,"[i, noticed, that, recently, after, updates, ,...","noticed recently updates , listening voice not...","i notic that recent after updat , when listen ...","i noticed that recently after update , when li..."
