In [1]:
import pandas as pd
from sklearn.utils import shuffle

In [2]:
# Directory holding the "|"-separated review files; change this one constant
# to run the notebook against a different data location.
DATA_DIR = "C:/Users/Desktop/UDSC/Sentiment analysis/Data"

# Load the two review corpora (RT and IMDB) with the same separator.
data_rt = pd.read_csv(DATA_DIR + "/reviews_rt_all.csv", sep="|")
data_imdb = pd.read_csv(DATA_DIR + "/imdb_small.csv", sep="|")

# Stack both corpora into one frame with a fresh index, then shuffle the rows.
data_df = pd.concat([data_rt, data_imdb], ignore_index=True, copy=False)
# A fixed seed keeps the shuffled row order identical across kernel restarts.
data_df = shuffle(data_df, random_state=42)


In [3]:
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [4]:
# Module-level Porter stemmer instance; `tokenize` below passes it to
# `stem_tokens` for every document.
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.stem`` applied to each token, in order.

    Args:
        tokens: iterable of tokens to stem.
        stemmer: any object exposing a ``stem(token)`` method.

    Returns:
        A new list holding one stemmed value per input token.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split ``text`` into stemmed word tokens.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space, the
    result is tokenized with ``nltk.word_tokenize``, and each token is stemmed
    via ``stem_tokens`` using the module-level ``stemmer``.

    Args:
        text: string to tokenize.

    Returns:
        List of stemmed tokens.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return stem_tokens(nltk.word_tokenize(letters_only), stemmer)

# Word-level count vectorizer that lower-cases its input and delegates
# tokenization (letters-only split + stemming) to `tokenize`.
vectorizer = CountVectorizer(analyzer='word', lowercase=True, tokenizer=tokenize)

In [None]:
# Fit the custom vectorizer on the 'text' column of the shuffled, combined
# frame and keep the transformed features.
# NOTE(review): this cell is marked `In [None]` (not executed in the saved
# run), and `data_features` is not referenced by any later cell visible here;
# the pipeline further down constructs its own default CountVectorizer.
# Confirm whether this cell is still needed.
data_features = vectorizer.fit_transform(data_df['text'])

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the RT rows as a test split; random_state pins the split.
X_train_rt, X_test_rt, y_train_rt, y_test_rt = train_test_split(
    data_rt["text"], data_rt["label"], test_size=0.2, random_state=42
)

In [7]:
# Hold out 20% of the IMDB rows as a test split; random_state pins the split.
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(
    data_imdb["text"], data_imdb["label"], test_size=0.2, random_state=42
)

In [8]:
# Merge the per-corpus splits into one train set and one test set.
# In every concatenation the RT rows come first, followed by the IMDB rows.
X_train, y_train = (
    pd.concat([X_train_rt, X_train_imdb]),
    pd.concat([y_train_rt, y_train_imdb]),
)
X_test, y_test = (
    pd.concat([X_test_rt, X_test_imdb]),
    pd.concat([y_test_rt, y_test_imdb]),
)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold


# Text-classification pipeline: token counts -> TF-IDF transform ->
# logistic regression. Step names are referenced by the grid keys below.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

# Grid-search space; keys use the "<step name>__<parameter>" form.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 2), (1, 3)),
    'tfidf__use_idf': (True, False),
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
}

# X_train is the RT split followed by the IMDB split, and an integer `cv`
# builds unshuffled stratified folds, so the folds would follow that ordering
# and be dominated by one corpus each. Shuffle inside the splitter (with a
# fixed seed for reproducibility) so every fold mixes both corpora.
cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid = GridSearchCV(pipeline, param_grid=parameters, cv=cv_folds, n_jobs=-1)
# `fit` returns the fitted search object, so `model` and `grid` are the same.
model = grid.fit(X=X_train, y=y_train)

print(grid.best_params_)

{'clf__C': 100, 'tfidf__use_idf': True, 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2)}


In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Score `model` (bound to the fitted grid search in the cell above) on the
# combined RT + IMDB test set and print per-class metrics.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

          0       0.83      0.79      0.81     12576
          1       0.86      0.89      0.87     17946

avg / total       0.84      0.84      0.84     30522



In [11]:
# Fit the pipeline on the RT training split only and score it on the RT test
# split.
# NOTE(review): this fits the `pipeline` object built above with default
# constructor arguments rather than reusing the grid search result, and it
# rebinds `model`, which the following cell also reads. Confirm both are
# intended.
pipeline.fit(X_train_rt, y_train_rt)
model = pipeline  # Pipeline.fit returns the pipeline itself
y_pred = model.predict(X_test_rt)

print(classification_report(y_true=y_test_rt, y_pred=y_pred))

             precision    recall  f1-score   support

          0       0.76      0.64      0.70      7521
          1       0.81      0.89      0.85     13001

avg / total       0.79      0.79      0.79     20522



In [12]:
# Score the current `model` on the IMDB test split.
# NOTE(review): when cells run in order, `model` is the pipeline fitted on RT
# training data only in the previous cell, so this is a cross-corpus
# evaluation. Confirm that is the intent.
y_pred = model.predict(X_test_imdb)
print(classification_report(y_true=y_test_imdb, y_pred=y_pred))

             precision    recall  f1-score   support

          0       0.85      0.86      0.85      5055
          1       0.85      0.85      0.85      4945

avg / total       0.85      0.85      0.85     10000

