In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load data
df = pd.read_csv('sentiment_data.csv')

# Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = re.sub('[^a-zA-Z]', ' ', text) # remove non-alphabetic characters
    text = text.lower() # convert text to lowercase
    words = text.split() # split text into words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words] # lemmatize words and remove stop words
    return ' '.join(words) # join the words back into a single string

df['review'] = df['review'].apply(preprocess)

# Vectorization
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


In [1]:
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# CSV - Comma Separated Values - id,name,age
# TSV - Tab Separated Values - id   name   age
imdb_reviews = pd.read_csv('reviews/imdb_labelled.txt', sep="\t", header=None)

In [3]:
imdb_reviews.head()

Unnamed: 0,0,1
0,A very very very slow moving aimless movie abo...,0
1,Not sure who was more lost the flat character...,0
2,Attempting artiness with black white and clev...,0
3,Very little music or anything to speak of,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
amazon_reviews = pd.read_csv('reviews/amazon_cells_labelled.txt', sep="\t", header=None)
yelp_reviews = pd.read_csv('reviews/yelp_labelled.txt', sep="\t", header=None)

In [5]:
imdb_reviews.shape

(1000, 2)

In [6]:
amazon_reviews.shape

(1000, 2)

In [7]:
yelp_reviews.shape

(1000, 2)

In [8]:
df = pd.DataFrame()
df = df.append(imdb_reviews).append(yelp_reviews).append(amazon_reviews)

In [9]:
df.shape

(3000, 2)

In [10]:
df.head()

Unnamed: 0,0,1
0,A very very very slow moving aimless movie abo...,0
1,Not sure who was more lost the flat character...,0
2,Attempting artiness with black white and clev...,0
3,Very little music or anything to speak of,0
4,The best scene in the movie was when Gerardo i...,1


In [11]:
df.columns = ['Review', 'Sentiment']

In [12]:
df.head()

Unnamed: 0,Review,Sentiment
0,A very very very slow moving aimless movie abo...,0
1,Not sure who was more lost the flat character...,0
2,Attempting artiness with black white and clev...,0
3,Very little music or anything to speak of,0
4,The best scene in the movie was when Gerardo i...,1


In [13]:
def textProcessing(df):
    table = str.maketrans('','',string.punctuation)
    for i in range(len(df)):
        df['Review'].iloc[i] = df['Review'].iloc[i].lower().translate(table)
    
    documents = []
    # word tokenization
    for i in range(len(df)):
        documents.append(word_tokenize(df['Review'].iloc[i]))
        
    englishStopwords = stopwords.words("english")
    words = []
    for tokens in documents:
        word = []
        for i in range(len(tokens)):
            if tokens[i] not in englishStopwords:
                word.append(tokens[i])
        words.append(word)
        
    wnet = WordNetLemmatizer()
    for i in range(len(words)):
        for j in range(len(words[i])):
            words[i][j] = wnet.lemmatize(words[i][j], 'v')
            
    for i in range(len(words)):
        words[i] = " ".join(words[i])
    
    return words

In [14]:
words = textProcessing(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [15]:
words[:5]

['slow move aimless movie distress drift young man',
 'sure lose flat character audience nearly half walk',
 'attempt artiness black white clever camera angle movie disappoint become even ridiculous act poor plot line almost nonexistent',
 'little music anything speak',
 'best scene movie gerardo try find song keep run head']

In [16]:
len(words)

3000

In [17]:
tfidf = TfidfVectorizer()

In [18]:
vector = tfidf.fit_transform(words)

In [19]:
vector

<3000x4531 sparse matrix of type '<class 'numpy.float64'>'
	with 18125 stored elements in Compressed Sparse Row format>

In [20]:
vector = vector.toarray()

In [21]:
vector

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
x_train, x_test, y_train, y_test = train_test_split(vector, df['Sentiment'], test_size=0.25)

In [23]:
x_train.shape

(2250, 4531)

In [24]:
x_test.shape

(750, 4531)

In [25]:
logistic = LogisticRegression()
logistic.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
y_pred = logistic.predict(x_test)

In [27]:
accuracy_score(y_test, y_pred)

0.8013333333333333

In [28]:
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
y_pred = nb.predict(x_test)
accuracy_score(y_test, y_pred)

0.78

In [30]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(cm)

Confusion matrix:
[[300  97]
 [ 68 285]]


In [31]:
new_review = "It is specially good for gaming although battery backup is not that good if you are a heavy user"

In [32]:
new_review_transformed = tfidf.transform([new_review])

In [33]:
new_review_sentiment = nb.predict(new_review_transformed)[0]

In [34]:
print("Sentiment:", "positive" if new_review_sentiment else "negative")

Sentiment: positive
