In [1]:
import pandas as pd
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ubeydgur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv("../data/imdb-dataset.csv")
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
documents = df["review"]
lables = df["sentiment"]

In [4]:
def clean_text(text):
    text = text.lower()

    text = re.sub(r"[^A-Za-z\s]", "", text)

    text = " ".join([word for word in text.split() if word not in stop_words and len(word) > 2])

    return text

In [5]:
cleaned_doc = [clean_text(row) for row in documents]
cleaned_doc[:2]

['one reviewers mentioned watching episode youll hooked right exactly happened mebr first thing struck brutality unflinching scenes violence set right word trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordbr called nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awaybr would say main appeal show due fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well mannered middle class inmates turne

In [6]:
vectorizer = CountVectorizer()

In [7]:
X = vectorizer.fit_transform(cleaned_doc[:1000])
feature_names = vectorizer.get_feature_names_out()

In [8]:
X_array = X.toarray()

In [9]:
df_bow = pd.DataFrame(X_array, columns=feature_names)

In [12]:
df_bow.head(10)

Unnamed: 0,aaargh,aaliyah,aamir,aaron,abandon,abandoned,abandons,abba,abbey,abbeys,...,zone,zoo,zoology,zoom,zooming,zooms,zucker,zulu,zwick,zzzzzzzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
word_frequencies_sum = df_bow.sum(axis=0)
word_frequencies_sum

aaargh                1
aaliyah               1
aamir                 1
aaron                 2
abandon               2
                     ..
zooms                 4
zucker                2
zulu                  1
zwick                 1
zzzzzzzzzzzzzzzzzz    1
Length: 20328, dtype: int64

In [16]:
word_word = dict(zip(feature_names, word_frequencies_sum))

In [15]:
most_common = Counter(word_word).most_common(5)
most_common

[('movie', 1703), ('film', 1473), ('one', 959), ('like', 773), ('good', 520)]