In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy corpus: four short sentences, each paired with a binary label.
toy_rows = [
    ('people watch campus', 1),
    ('campus watch campus', 1),
    ('people write comment', 0),
    ('campus write comment', 0),
]
df = pd.DataFrame(toy_rows, columns=['text', 'output'])

In [3]:
df

Unnamed: 0,text,output
0,people watch campus,1
1,campus watch campus,1
2,people write comment,0
3,campus write comment,0


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [5]:
# Learn the vocabulary from the toy sentences and encode each sentence as a row of token counts.
bow = cv.fit_transform(df['text'])

In [6]:
# Learned vocabulary: maps each token to its column index in the count matrix.
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'campus': 0, 'write': 4, 'comment': 1}


In [7]:
# Dense count vectors for the first two sentences.
for row_idx in (0, 1):
    print(bow[row_idx].toarray())

[[1 0 1 1 0]]
[[2 0 0 1 0]]


In [8]:
# Encode an unseen sentence with the already-fitted vocabulary; "and" and "of" are not
# in that vocabulary, so they contribute nothing to the resulting counts.
cv.transform(["campus watch and write comment of campus"]).toarray()

array([[2, 1, 0, 1, 1]], dtype=int64)

# N-grams

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2,2): the vocabulary consists of two-word sequences (bigrams) only.
cv = CountVectorizer(ngram_range=(2,2))

In [10]:
bow = cv.fit_transform(df['text'])

In [11]:
print(cv.vocabulary_)

{'people watch': 2, 'watch campus': 4, 'campus watch': 0, 'people write': 3, 'write comment': 5, 'campus write': 1}


In [12]:
# One dense bigram-count row per toy sentence.
for row_idx in range(4):
    print(bow[row_idx].toarray())

[[0 0 1 0 1 0]]
[[1 0 0 0 1 0]]
[[0 0 0 1 0 1]]
[[0 1 0 0 0 1]]


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigrams only: ngram_range=(1,1).
cv = CountVectorizer(ngram_range=(1,1))
bow = cv.fit_transform(df['text'])
print(cv.vocabulary_)
# Print the dense count vector of each of the four toy sentences.
for row_idx in range(4):
    print(bow[row_idx].toarray())

{'people': 2, 'watch': 3, 'campus': 0, 'write': 4, 'comment': 1}
[[1 0 1 1 0]]
[[2 0 0 1 0]]
[[0 1 1 0 1]]
[[1 1 0 0 1]]


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigrams and bigrams together: ngram_range=(1,2).
cv = CountVectorizer(ngram_range=(1,2))
bow = cv.fit_transform(df['text'])
print(cv.vocabulary_)
# Print the dense count vector of each of the four toy sentences.
for row_idx in range(4):
    print(bow[row_idx].toarray())

{'people': 4, 'watch': 7, 'campus': 0, 'people watch': 5, 'watch campus': 8, 'campus watch': 1, 'write': 9, 'comment': 3, 'people write': 6, 'write comment': 10, 'campus write': 2}
[[1 0 0 0 1 1 0 1 1 0 0]]
[[2 1 0 0 0 0 0 1 1 0 0]]
[[0 0 0 1 1 0 1 0 0 1 1]]
[[1 0 1 1 0 0 0 0 0 1 1]]


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigrams only: ngram_range=(2,2).
cv = CountVectorizer(ngram_range=(2,2))
bow = cv.fit_transform(df['text'])
print(cv.vocabulary_)
# Print the dense count vector of each of the four toy sentences.
for row_idx in range(4):
    print(bow[row_idx].toarray())

{'people watch': 2, 'watch campus': 4, 'campus watch': 0, 'people write': 3, 'write comment': 5, 'campus write': 1}
[[0 0 1 0 1 0]]
[[1 0 0 0 1 0]]
[[0 0 0 1 0 1]]
[[0 1 0 0 0 1]]


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Trigrams only: ngram_range=(3,3).
cv = CountVectorizer(ngram_range=(3,3))
bow = cv.fit_transform(df['text'])
print(cv.vocabulary_)
# Print the dense count vector of each of the four toy sentences.
for row_idx in range(4):
    print(bow[row_idx].toarray())

{'people watch campus': 2, 'campus watch campus': 0, 'people write comment': 3, 'campus write comment': 1}
[[0 0 1 0]]
[[1 0 0 0]]
[[0 0 0 1]]
[[0 1 0 0]]


# Tf-idf

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights for the toy sentences, shown as a dense array (one row per sentence).
tfidf = TfidfVectorizer()
tfidf.fit_transform(df['text']).toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [18]:
# Per-term IDF weights, followed by the feature names in the same column order.
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[1.22314355 1.51082562 1.51082562 1.51082562 1.51082562]
['campus' 'comment' 'people' 'watch' 'write']


# IMDB Dataset

In [29]:
import numpy as np
import pandas as pd

In [30]:
import os

# The CSV location defaults to the path this notebook was originally run with.
# Set the IMDB_CSV_PATH environment variable to load the dataset from elsewhere
# without editing the notebook.
imdb_csv_path = os.environ.get(
    "IMDB_CSV_PATH",
    "C:\\Users\\yasha\\Downloads\\IMDB Dataset.csv\\IMDB Dataset.csv",
)
temp_df = pd.read_csv(imdb_csv_path)
temp_df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [31]:
# Work on the first 10,000 reviews. .copy() makes df an independent frame, so the
# in-place edits and column assignments further down neither trigger
# SettingWithCopyWarning nor touch temp_df.
df = temp_df.iloc[:10000].copy()

In [32]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Look at the full text of the review with index label 1 to see what cleaning is needed.
df['review'][1]

In [None]:
# Class balance: number of reviews per sentiment label.
df['sentiment'].value_counts()

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Rebind instead of mutating in place: this avoids SettingWithCopyWarning when df is a
# slice of another frame, and the result is the same de-duplicated set of rows.
df = df.drop_duplicates()

In [None]:
# Basic preprocessing
# Remove tags
# Lowercase
# Remove stopwords

In [None]:
import re
_HTML_TAG_PATTERN = re.compile('<.*?>')


def remove_tags(raw_text):
    """Return ``raw_text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is stripped individually. ``.`` does not
    cross newlines, so a ``<`` with no ``>`` on the same line is left untouched.
    """
    return _HTML_TAG_PATTERN.sub('', raw_text)

In [None]:
# Strip HTML tags from every review, element by element.
reviews_without_tags = df['review'].map(remove_tags)
df['review'] = reviews_without_tags

In [None]:
df                 # Reviews after remove_tags was applied: HTML tags are stripped

In [None]:
# Vectorised lowercasing. Unlike a per-row lambda, .str.lower() leaves missing values
# as NaN instead of raising AttributeError.
df['review'] = df['review'].str.lower()

In [None]:
df                  # All reviews are now lowercase

In [None]:
from nltk.corpus import stopwords

# NOTE(review): this presumably needs the NLTK stopwords corpus to be installed
# locally (nltk.download('stopwords')) -- confirm in a fresh environment.
sw_list = stopwords.words('english')
# A set gives constant-time membership tests; checking every token of every review
# against the list is a linear scan each time.
sw_set = set(sw_list)
# Drop stopwords and re-join the remaining whitespace-separated tokens in one pass.
df['review'] = df['review'].apply(
    lambda text: " ".join(word for word in text.split() if word not in sw_set)
)

In [None]:
df

In [None]:
# Target: the sentiment labels. Features: a one-column DataFrame with the review text.
y = df['sentiment']
X = df[['review']]

In [None]:
X

In [None]:
y

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the string sentiment labels with integer codes.
# NOTE(review): LabelEncoder sorts its classes, so presumably negative -> 0 and
# positive -> 1; confirm via encoder.classes_.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X_train.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer()

In [None]:
# Fit the vocabulary on the training reviews only, then reuse it to encode the test reviews.
# NOTE(review): .toarray() densifies both matrices; GaussianNB (fitted on X_train_bow further
# down) presumably needs dense input, but with the full vocabulary this can be memory-hungry.
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [None]:
X_train_bow.shape

In [None]:
X_test_bow.shape

In [None]:
from sklearn.naive_bayes import GaussianNB
# Baseline classifier: Gaussian Naive Bayes trained on the bag-of-words counts.
gnb = GaussianNB()
gnb.fit(X_train_bow, y_train)

In [None]:
y_pred = gnb.predict(X_test_bow)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the same bag-of-words features. The seed is fixed so the accuracy
# is repeatable and can be compared fairly with other vectoriser settings.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)


In [None]:
accuracy_score(y_test, y_pred)

In [None]:
cv = CountVectorizer(max_features=3000)

In [None]:
# Re-encode the reviews with the current `cv` settings. The matrices stay sparse:
# RandomForestClassifier accepts scipy sparse input, so densifying with .toarray()
# only costs memory.
X_train_bow = cv.fit_transform(X_train['review'])
X_test_bow = cv.transform(X_test['review'])
# Fixed seed so the accuracy is repeatable across runs.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)


In [None]:
accuracy_score(y_test, y_pred)

In [None]:
# Unigrams + bigrams over the full vocabulary. The matrices are left sparse:
# RandomForestClassifier accepts scipy sparse input, whereas densifying an unbounded
# bigram vocabulary with .toarray() can exhaust memory.
cv = CountVectorizer(ngram_range=(1,2))
X_train_bow = cv.fit_transform(X_train['review'])
X_test_bow = cv.transform(X_test['review'])
# Fixed seed so the accuracy is repeatable across runs.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

In [None]:
# Unigrams + bigrams, keeping only the 5000 most frequent features. The matrices stay
# sparse: RandomForestClassifier accepts scipy sparse input, so .toarray() is unnecessary.
cv = CountVectorizer(ngram_range=(1,2),max_features=5000)
X_train_bow = cv.fit_transform(X_train['review'])
X_test_bow = cv.transform(X_test['review'])
# Fixed seed so the accuracy is repeatable across runs.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

# Using Tfidf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
tfidf = TfidfVectorizer()

In [None]:
# Fit TF-IDF on the training reviews only and reuse it for the test reviews.
# Both matrices are kept sparse so train and test share one representation;
# RandomForestClassifier accepts scipy sparse input for fit and predict.
X_train_tfidf = tfidf.fit_transform(X_train['review'])
X_test_tfidf = tfidf.transform(X_test['review'])

In [None]:
# Random forest on the TF-IDF features. The seed is fixed so the accuracy is
# repeatable across runs.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test,y_pred)