In [1]:
import pandas as pd
import nltk

## Load Dataset

In [2]:
data = pd.read_csv(
    "sentiment_tweets3.csv")
print(data.shape)
data.head()


(20001, 3)


Unnamed: 0,review_label,ID,review_text
0,2,B004A9SDD8,"Loves the song, so he really couldn't wait to ..."
1,3,B004A9SDD8,"Oh, how my little grandson loves this app. He'..."
2,3,B004A9SDD8,I found this at a perfect time since my daught...
3,3,B004A9SDD8,My 1 year old goes back to this game over and ...
4,3,B004A9SDD8,There are three different versions of the song...


In [3]:
train_df.isna().sum()

review_label    0
ID              0
review_text     0
dtype: int64

In [4]:
test_df = pd.read_csv("test.txt", sep="\t", names=["review_label", "ID", "review_text"])
print(test_df.shape)
test_df.head()


(19999, 3)


Unnamed: 0,review_label,ID,review_text
0,3,B004K4RY9M,I am a person who has always enjoyed word game...
1,3,B004K4RY9M,Love this. I try to beat my own time to see h...
2,3,B004K4RY9M,This game is fun and it can also be alearning ...
3,3,B004K4RY9M,I enjoy these puzzles have books of them keep ...
4,3,B004K4RY9M,Have spent many enjoyable hours playing this g...


In [5]:
test_df.isna().sum()

review_label    0
ID              0
review_text     0
dtype: int64

In [22]:
#check the sahpe of your test and train dataset
print(train_df.shape)
print(test_df.shape)

(20001, 4)
(19999, 4)


In [6]:
# Add a new column indicating which dataset each record belongs to
train_df['dataset'] = 0
test_df['dataset'] = 1

In [7]:
# Combine the datasets
combined_df = pd.concat([train_df, test_df], ignore_index=True)

## Data Preprocessing

- **Convert to lowercase**
- **Remove special characters**
- **Remove stopwords**
- **Lemmatization**
- 

In [8]:
import string

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords') 

nltk.download('wordnet') 
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ve797\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ve797\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ve797\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ve797\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Remove special characters

In [9]:
stop_words = stopwords.words('english') 
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
punctuation_list = list(string.punctuation)
print(punctuation_list)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [11]:
useless_words = stop_words + punctuation_list

In [12]:
# remove punctuation and stopwords in review_text column

combined_df["review_text"] = combined_df["review_text"].str.lower().str.replace("[^\w\s]", "", regex=True).apply(
    lambda x: " ".join(word for word in word_tokenize(x) if word not in useless_words)
)

combined_df.head()


Unnamed: 0,review_label,ID,review_text,dataset
0,2,B004A9SDD8,loves song really couldnt wait play little les...,0
1,3,B004A9SDD8,oh little grandson loves app hes always asking...,0
2,3,B004A9SDD8,found perfect time since daughters favorite so...,0
3,3,B004A9SDD8,1 year old goes back game simple easy toddler ...,0
4,3,B004A9SDD8,three different versions song game keeps occup...,0


### Lemmatization

In [13]:
# apply lemmatization to review_text column

lemmatizer = WordNetLemmatizer()

combined_df["review_text"] = combined_df["review_text"].apply(
    lambda x: " ".join(lemmatizer.lemmatize(word) for word in word_tokenize(x))
)

combined_df.head()

Unnamed: 0,review_label,ID,review_text,dataset
0,2,B004A9SDD8,love song really couldnt wait play little le i...,0
1,3,B004A9SDD8,oh little grandson love app he always asking m...,0
2,3,B004A9SDD8,found perfect time since daughter favorite son...,0
3,3,B004A9SDD8,1 year old go back game simple easy toddler us...,0
4,3,B004A9SDD8,three different version song game keep occupie...,0


In [14]:
combined_df["review_text"].apply(
    lambda x: " ".join(tag for _, tag in nltk.pos_tag(word_tokenize(x)))
)

0        NN NN RB JJ NN NN JJ JJ VBG JJ NN RB RB CD IN ...
1        RB JJ NN VB NN PRP RB VBG JJ NNS VBD JJ JJ NN ...
2        VBN JJ NN IN NN NN NN NN CD JJ NN JJ JJ CD NN ...
3        CD NN JJ VBP RB NN JJ JJ NN NN JJ RB VBD CD NN...
4        CD JJ NN NN NN VB VBD NN NN NN NN VBP IN NN NN...
                               ...                        
39995               NN NN NN NN JJ JJ NN JJ NN NN JJ NN NN
39996    RB RB VB JJ JJ JJ NN NN NN NN NN RB JJ NN NN N...
39997    VB NN JJ NN JJ JJ NN DT NN VBZ JJ JJ VB UH JJ ...
39998          RB JJ NN NN VBG NN RB JJ NN NN VBG NN NN NN
39999    JJ JJ NN VBD VBD RB JJ NN MD RB VB RB JJ NN NN...
Name: review_text, Length: 40000, dtype: object

In [15]:
# Split the combined dataset back into the original datasets
train_processed_df = combined_df.loc[combined_df['dataset'] == 0].drop(columns=['dataset'])
test_processed_df = combined_df.loc[combined_df['dataset'] == 1].drop(columns=['dataset'])



In [23]:
#check the sahpe of your test and train dataset
print(train_df.shape)
print(test_df.shape)

(20001, 4)
(19999, 4)


In [17]:
# Verify that the original datasets are intact
print(train_df.equals(train_processed_df))
print(test_df.equals(test_processed_df))

False
False


In [19]:
# Verify that the original datasets are intact
train_processed_df.head()

Unnamed: 0,review_label,ID,review_text
0,2,B004A9SDD8,love song really couldnt wait play little le i...
1,3,B004A9SDD8,oh little grandson love app he always asking m...
2,3,B004A9SDD8,found perfect time since daughter favorite son...
3,3,B004A9SDD8,1 year old go back game simple easy toddler us...
4,3,B004A9SDD8,three different version song game keep occupie...


In [20]:
# Verify that the original datasets are intact
test_processed_df.head()

Unnamed: 0,review_label,ID,review_text
20001,3,B004K4RY9M,person always enjoyed word game thiis one exce...
20002,3,B004K4RY9M,love try beat time see fast complete keep mind...
20003,3,B004K4RY9M,game fun also alearning game recomend age 7 go...
20004,3,B004K4RY9M,enjoy puzzle book keep entertained hour great ...
20005,3,B004K4RY9M,spent many enjoyable hour playing game would r...
