## **Preprocessing**

**1. Load the dataset.**

In [None]:
from google.colab import files #uploading text.csv file
uploaded = files.upload()

Saving text.csv to text (2).csv


In [None]:
import numpy as np
import pandas as pd

**2. Display the first 5 rows.**

In [None]:
df = pd.read_csv('text.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,label,review
0,0,neg,how do films like mouse hunt get into theatres...
1,1,neg,some talented actresses are blessed with a dem...
2,2,pos,this has been an extraordinary year for austra...
3,3,pos,according to hollywood movies made in last few...
4,4,neg,my first press screening of 1998 and already i...


# **Cleaning**

**3. Convert the text in the review column to lowercase**

In [None]:
text_col = df['review']

In [None]:
text_col = df['review'].str.lower()
text_col.head(5)

0    how do films like mouse hunt get into theatres...
1    some talented actresses are blessed with a dem...
2    this has been an extraordinary year for austra...
3    according to hollywood movies made in last few...
4    my first press screening of 1998 and already i...
Name: review, dtype: object

**4. Remove stopwords + 5. Remove punctuation signs.**

In [None]:
import re
import nltk
import spacy
from spacy import displacy
nltk.download('stopwords')
nlp = spacy.load('en_core_web_sm')
stopword = nltk.corpus.stopwords.words('english')
def text_col_cleaning(text_col):
    text_col = re.sub(r'[^\w\s]', '',str(text_col))             #Punctuations
    text_col=re.split("\W+",text_col)                           #Tokenizing
    text_col=[word for word in text_col if word not in stopword]#Stop words
    text_col = ' '.join(text_col)
    return text_col

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
cleaned_col = text_col.apply(lambda x :text_col_cleaning(x))
cleaned_col.head(5)

0    films like mouse hunt get theatres isnt law so...
1    talented actresses blessed demonstrated wide a...
2    extraordinary year australian films shine scoo...
3    according hollywood movies made last decades l...
4    first press screening 1998 already ive gotten ...
Name: review, dtype: object

**6. Apply lemmatization**

In [None]:
import nltk
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
cleaned_col2 = cleaned_col.apply(lambda x: lemmatizer.lemmatize(x))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
cleaned_col2.head(5)

0    films like mouse hunt get theatres isnt law so...
1    talented actresses blessed demonstrated wide a...
2    extraordinary year australian films shine scoo...
3    according hollywood movies made last decades l...
4    first press screening 1998 already ive gotten ...
Name: review, dtype: object

**7. Remove rows that contain missing values**

In [None]:
df = df.dropna()

In [None]:
df['review'] = cleaned_col2

# **TF-IDF**

**8. Create positive sentiment and negative sentiment dataframes**

In [None]:
positive = df[df['label'] == 'pos']
negative = df[df['label'] == 'neg']

In [None]:
positive

Unnamed: 0.1,Unnamed: 0,label,review
2,2,pos,extraordinary year australian films shine scoo...
3,3,pos,according hollywood movies made last decades l...
11,11,pos,stars like sigourney weaver alien trilogy acad...
16,16,pos,remember hearing film first appeared cannes ye...
18,18,pos,garry shandling makes long overdue starring fi...
...,...,...,...
1995,1995,pos,like movies albert brooks really like movies d...
1996,1996,pos,might surprise know joel ethan coen brought un...
1997,1997,pos,verdict spinechilling drama horror maestro ste...
1998,1998,pos,want correct wrote former retrospective david ...


In [None]:
negative

Unnamed: 0.1,Unnamed: 0,label,review
0,0,neg,films like mouse hunt get theatres isnt law so...
1,1,neg,talented actresses blessed demonstrated wide a...
4,4,neg,first press screening 1998 already ive gotten ...
5,5,neg,put bluntly ed wood would proud totally ridicu...
6,6,neg,synopsis melissa mentallydisturbed woman likes...
...,...,...,...
1985,1985,neg,real blonde r womans face arm pumpedup pectora...
1986,1986,neg,following review contains spoilers way rapist...
1987,1987,neg,book remained shadows book shadows blair witc...
1991,1991,neg,right right get point despite similarities bes...


**9. Calculate the TF-IDF for positive cleaned reviews**

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
cleaned_pos = positive['review']

In [None]:
count = CountVectorizer()
count1 = CountVectorizer()
word_count_pos=count.fit_transform(cleaned_pos)
print(word_count_pos)

  (0, 10789)	1
  (0, 33932)	4
  (0, 2370)	3
  (0, 11358)	2
  (0, 27279)	2
  (0, 26495)	1
  (0, 23003)	1
  (0, 11335)	5
  (0, 15632)	1
  (0, 2463)	1
  (0, 22569)	1
  (0, 3205)	4
  (0, 808)	1
  (0, 8457)	1
  (0, 10335)	1
  (0, 846)	1
  (0, 13223)	1
  (0, 17515)	1
  (0, 1611)	1
  (0, 6887)	1
  (0, 12098)	1
  (0, 13273)	1
  (0, 18305)	1
  (0, 23500)	1
  (0, 14306)	1
  :	:
  (981, 1928)	2
  (981, 31335)	1
  (981, 5897)	1
  (981, 9082)	1
  (981, 7876)	1
  (981, 18927)	1
  (981, 22691)	1
  (981, 11847)	1
  (981, 28720)	1
  (981, 6119)	1
  (981, 8346)	1
  (981, 13789)	1
  (981, 14158)	1
  (981, 9265)	1
  (981, 24545)	1
  (981, 4878)	1
  (981, 21645)	1
  (981, 356)	1
  (981, 23175)	1
  (981, 4877)	2
  (981, 31350)	2
  (981, 19436)	2
  (981, 29737)	1
  (981, 9319)	1
  (981, 22769)	1


In [None]:
word_count_pos.shape

(982, 34158)

In [None]:
print(word_count_pos.toarray())

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
pd.DataFrame(word_count_pos.toarray(), columns=count.get_feature_names_out())

Unnamed: 0,00,000,0009f,000aweek,007,05,10,100,1000,10000,...,zuehlke,zuko,zukovsky,zundel,zurgs,zweibel,zwick,zwicks,zwigoffs,zycie
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
978,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
979,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
980,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_pos)
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=count.get_feature_names_out(),columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
film,1.138339
one,1.144197
movie,1.338101
like,1.355368
time,1.488692
...,...
interfering,7.197462
interferes,7.197462
interfered,7.197462
interminably,7.197462


In [None]:
#tfidf
tf_idf_vector=tfidf_transformer.transform(word_count_pos) #transforms the count matrix word_count into a TF-IDF matrix using the fitted tfidf_transformer.
feature_names = count.get_feature_names_out() #retrives the words
first_document_vector=tf_idf_vector[0] #represents the TF-IDF values for each term in that document.
df_tfifd= pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
# Terms with higher TF-IDF values are considered more important.
df_tfifd.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
australia,0.229268
aboriginal,0.217377
payback,0.189703
brown,0.174577
white,0.163403
...,...
firecrackers,0.000000
firebreathing,0.000000
firebird,0.000000
fireballs,0.000000


**9. Calculate the TF-IDF for negative cleaned reviews**

In [None]:
cleaned_neg = negative['review']

In [None]:
count2 = CountVectorizer()
#The resulting word_count variable will be a sparse matrix where each row represents a document,
#and each column represents a unique word in the corpus.
#The values in the matrix represent the count of each word in each document.
word_count_neg=count.fit_transform(cleaned_neg)
# word_count1 = count1.fit_transform(new)
print(word_count_neg)

  (0, 10516)	1
  (0, 16417)	2
  (0, 18505)	6
  (0, 13749)	3
  (0, 11671)	1
  (0, 28447)	1
  (0, 14894)	1
  (0, 16074)	1
  (0, 26374)	1
  (0, 7714)	1
  (0, 16580)	1
  (0, 5241)	1
  (0, 27079)	1
  (0, 26595)	1
  (0, 8511)	2
  (0, 27386)	1
  (0, 13390)	1
  (0, 10147)	1
  (0, 10189)	1
  (0, 7118)	1
  (0, 31569)	1
  (0, 28053)	1
  (0, 2559)	1
  (0, 28594)	1
  (0, 21312)	2
  :	:
  (982, 25807)	1
  (982, 17325)	1
  (982, 29160)	1
  (982, 8687)	1
  (982, 29130)	1
  (982, 22282)	1
  (982, 10767)	1
  (982, 6456)	1
  (982, 13381)	1
  (982, 26487)	1
  (982, 22401)	1
  (982, 448)	1
  (982, 19756)	1
  (982, 16063)	1
  (982, 21113)	1
  (982, 15331)	1
  (982, 274)	1
  (982, 16501)	2
  (982, 29410)	1
  (982, 1141)	1
  (982, 27083)	2
  (982, 10830)	1
  (982, 24118)	1
  (982, 5473)	1
  (982, 10426)	1


In [None]:
word_count_neg.shape

(983, 31926)

In [None]:
print(word_count_neg.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
pd.DataFrame(word_count_neg.toarray(), columns=count.get_feature_names_out())

Unnamed: 0,00,000,000acre,000foot,000paltry,007,010,03,04,05,...,zoomouts,zooms,zoot,zorro,zsigmond,zucker,zulu,zwick,zwicks,zwigoffs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
979,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
980,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
981,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
#calculates the IDF weights based on the counts of each term in the documents.
tfidf_transformer.fit(word_count_neg)
# index is the feature names obtained from count.get_feature_names_out()
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=count.get_feature_names_out(),columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
one,1.166592
film,1.187212
movie,1.245235
like,1.293117
even,1.420826
...,...
integrate,7.198479
intaking,7.198479
insurrections,7.198479
insouciance,7.198479


**10. 10 most important words in each dataset**

10 most important words in positive dataset: australia, aboriginal, payback, brown, white, firecrackers,
firebreathing,
firebird,
fireballs,
zycie

10 most important words in negative dataset: one, film, movie, like, even, integrate, intaking, insurrections, insouciance, zwigoffs