In [89]:
import pandas as pd
from sklearn.utils import shuffle

In [90]:
data_rt = pd.read_csv("C:/Users/Valentina/Desktop/UDSC/Sentiment analysis/Data/reviews_rt_all.csv", sep="|")
data_imdb = pd.read_csv("C:/Users/Valentina/Desktop/UDSC/Sentiment analysis/Data/imdb_small.csv", sep="|")

data_df = pd.concat([data_rt, data_imdb], ignore_index=True, copy=False)
data_df = shuffle(data_df)

text_data = data_df['text']

In [91]:
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer

In [94]:
txt = text_data.str.lower().str.cat(sep=' ')
words = nltk.word_tokenize(txt)
word_dist = nltk.FreqDist(words)
most_common_words = word_dist.most_common(25)
most_common_words 

[('the', 761112),
 (',', 643966),
 ('.', 569868),
 ('a', 389803),
 ('and', 381724),
 ('of', 348142),
 ('to', 304344),
 ('is', 251755),
 ('it', 219441),
 ('in', 210754),
 ('/', 202432),
 ('>', 202255),
 ('<', 202095),
 ('br', 201951),
 ('i', 178441),
 ('that', 165419),
 ('this', 165356),
 ("'s", 157691),
 ('as', 109198),
 ('with', 103162),
 ('was', 103158),
 ('for', 101396),
 ('but', 99448),
 ('movie', 95466),
 ('film', 92772)]

In [97]:
from nltk.stem import SnowballStemmer
def tokenize(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    word_list = nltk.word_tokenize(text)
    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(word) for word in word_list]
    return stems

In [98]:
vectorizer = CountVectorizer(tokenizer=tokenize, stop_words = most_common_words)
data_features = vectorizer.fit_transform(text_data)

In [99]:
from sklearn.model_selection import train_test_split
X_train_rt, X_test_rt, y_train_rt, y_test_rt  = train_test_split(
        data_rt.text, 
        data_rt.label,
        test_size=0.2, 
        random_state=42)

In [100]:
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb  = train_test_split(
        data_imdb.text, 
        data_imdb.label,
        test_size=0.2, 
        random_state=42)

In [101]:
X_train = pd.concat([X_train_rt, X_train_imdb])
X_test = pd.concat([X_test_rt, X_test_imdb])
y_train = pd.concat([y_train_rt, y_train_imdb])
y_test = pd.concat([y_test_rt, y_test_imdb])

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('vectorizer', vectorizer), 
                     ('classifier', LogisticRegression())])
model = pipeline.fit(X=X_train, y=y_train)

In [103]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.80      0.74      0.77     12576
          1       0.83      0.87      0.85     17946

avg / total       0.82      0.82      0.81     30522



In [104]:
y_pred = model.predict(X_test_rt)

print(classification_report(y_test_rt, y_pred))


             precision    recall  f1-score   support

          0       0.73      0.64      0.68      7521
          1       0.81      0.87      0.83     13001

avg / total       0.78      0.78      0.78     20522



In [105]:
y_pred = model.predict(X_test_imdb)

print(classification_report(y_test_imdb, y_pred))

             precision    recall  f1-score   support

          0       0.89      0.88      0.89      5055
          1       0.88      0.89      0.88      4945

avg / total       0.88      0.88      0.88     10000

