In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
from nltk import pos_tag

In [2]:
reviews = pd.read_csv('IMDB Dataset.csv')

reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
reviews['review'] = reviews['review'].str.lower()

reviews.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [4]:
stopword = stopwords.words('english')

punctuations = string.punctuation

In [5]:
def removePunctuations(text):
     
    #remove punctuations
    no_punctuation = ''.join([char for char in text if char not in punctuations])
    
    return no_punctuation

In [6]:
reviews['review'] = reviews['review'].apply(lambda x: removePunctuations(x))

reviews.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [7]:
reviews['review'] = reviews['review'].apply(lambda x: word_tokenize(x))

In [8]:
def removeStopwords(tokenized_list):
    
    text = [word for word in tokenized_list if word not in stopword]
    
    return text

In [9]:
reviews['review'] = reviews['review'].apply(lambda x: removeStopwords(x))

reviews.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",positive
1,"[wonderful, little, production, br, br, filmin...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, theres, family, little, boy, jake,...",negative
4,"[petter, matteis, love, time, money, visually,...",positive


In [10]:
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    
    if pos_tag.startswith('J'):
        
        return wordnet.ADJ
    
    elif pos_tag.startswith('V'):
        
        return wordnet.VERB
    
    elif pos_tag.startswith('N'):
    
        return wordnet.NOUN
    
    elif pos_tag.startswith('R'):
    
        return wordnet.ADV
    
    else:
        
        return wordnet.NOUN

In [11]:
wordnets = WordNetLemmatizer()

def lemmatizing(tokenized_list):
    
    text = [wordnets.lemmatize(word, get_wordnet_pos(word)) for word in tokenized_list]
    
    return text

In [12]:
reviews['review'] = reviews['review'].apply(lambda x: str(lemmatizing(x)))

In [13]:
vectorizer = TfidfVectorizer()

vectorizer.fit(reviews['review'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [14]:
feature_train, feature_test, label_train, label_test  = train_test_split(reviews['review'], reviews['sentiment'], test_size=0.2, random_state=25)

In [15]:
feature_train_vect = vectorizer.transform(feature_train)

feature_test_vect = vectorizer.transform(feature_test)

In [16]:
model  = GradientBoostingClassifier()

In [17]:
#model.fit(feature_train_vect, label_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [18]:
model.score(feature_test_vect, label_test)

0.8076

In [19]:
ypred = model.predict(feature_test_vect)

print(len(ypred))

10000


In [20]:
(unique, counts) = np.unique(ypred, return_counts=True)

frequency = np.asarray((unique, counts))

print(frequency)

[['negative' 'positive']
 [4533 5467]]


In [21]:
values = list(frequency[1])

familiarity = values[0]/len(ypred) * 5

print(familiarity)

2.2664999999999997


In [22]:
interactivity = (values[1]/len(ypred) * 5) + 5

print(interactivity)

7.733499999999999


In [23]:
model1 = RandomForestClassifier(n_estimators = 200, n_jobs=-1)

In [24]:
model1.fit(feature_train_vect, label_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [25]:
model1.score(feature_test_vect, label_test)

0.8577

In [26]:
ypred1 = model1.predict(feature_test_vect)

print(len(ypred1))

10000


In [27]:
(unique, counts) = np.unique(ypred1, return_counts=True)

frequency = np.asarray((unique, counts))

print(frequency)

[['negative' 'positive']
 [5050 4950]]


In [28]:
values = list(frequency[1])

familiarity = values[0]/len(ypred1) * 5

print(familiarity)

2.525


In [29]:
interactivity = (values[1]/len(ypred1) * 5) + 5

print(interactivity)

7.475


In [30]:
# save the model and load for prediction

joblib.dump(model1, 'fans_sentiment_scale.joblib')

['fans_sentiment_scale.joblib']