In [1]:
# importing the Dataset
import pandas as pd
reviews = pd.read_csv('/home/abhisek/Documents/Python/imdb_master.csv',usecols=["review", "label"],
                      encoding='ISO-8859-1')

In [2]:
reviews

Unnamed: 0,review,label
0,Once again Mr. Costner has dragged out a movie...,neg
1,This is an example of why the majority of acti...,neg
2,"First of all I hate those moronic rappers, who...",neg
3,Not even the Beatles could write songs everyon...,neg
4,Brass pictures (movies is not a fitting word f...,neg
...,...,...
99995,"Delightfully awful! Made by David Giancola, a ...",unsup
99996,"Watching Time Chasers, it obvious that it was ...",unsup
99997,At the beginning we can see members of Troma t...,unsup
99998,"The movie was incredible, ever since I saw it ...",unsup


## Text Preprocessing

In [3]:
#Data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abhisek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()
lemma=WordNetLemmatizer()

In [10]:
# For Stemming
corpus = []
for i in range(0, len(reviews)):
    review = re.sub('[^a-zA-Z]', ' ', reviews['review'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

## Or

In [5]:
# For Lemmatizer
corpus = []
for i in range(0, len(reviews)):
    review = re.sub('[^a-zA-Z]', ' ', reviews['review'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemma.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
corpus

['mr costner dragged movie far longer necessary aside terrific sea rescue sequence care character u ghost closet costner character realized early forgotten much later time care character really care cocky overconfident ashton kutcher problem come kid think better anyone else around show sign cluttered closet obstacle appears winning costner finally well past half way point stinker costner tell u kutcher ghost told kutcher driven best prior inkling foreshadowing magic could keep turning hour',
 'example majority action film generic boring really nothing worth watching complete waste barely tapped talent ice ice cube proven many time capable acting acting well bother one go see new jack city ricochet watch new york undercover ice boyz n hood higher learning friday ice cube see real deal ice horribly cliched dialogue alone make film grate teeth still wondering heck bill paxton film heck always play exact character alien onward every film seen bill paxton playing exact irritating character

## Creating the Bag of Words model

In [7]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [10]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
y=pd.get_dummies(reviews['label'])
y=y.iloc[:,1].values

In [19]:
y

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [20]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [26]:
from sklearn.ensemble import RandomForestClassifier
review_detect_model = RandomForestClassifier().fit(X_train, y_train)

In [27]:
#prediction
y_pred=review_detect_model .predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(score)

0.75335


In [29]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86     19526
           1       0.06      0.60      0.10       474

    accuracy                           0.75     20000
   macro avg       0.52      0.68      0.48     20000
weighted avg       0.97      0.75      0.84     20000



## Creating the TFIDF model