# Yelp Review Sentiment Analysis (NLTK & SkLearn)

## Introductory Work

Import Libraries

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
import pandas as pd
import string
import re

Read the data

In [None]:
# Load the tab-separated review file; it is read without a header row,
# so the two columns are named explicitly afterwards.
df = pd.read_csv("yelp_labelled.txt", sep="\t", header=None)
df.columns = ["Text", "Sentiment"]
df.head()

## Processing The Text 

Remove punctuation and make all characters lower case

In [None]:
def rmPunc(x):
    """Return ``x`` lower-cased, keeping only alphanumeric characters and plain spaces.

    Every other character (punctuation, tabs, newlines, ...) is dropped
    without a replacement, so words separated only by such a character
    end up joined together.
    """
    kept = [char.lower() for char in x if char == " " or char.isalnum()]
    return "".join(kept)

# Store the cleaned, lower-cased text next to the raw review.
df["punc"] = df["Text"].apply(rmPunc)
df.head()

Tokenize the text 

In [None]:
# word_tokenize needs NLTK tokenizer data; uncomment the download on a first run.
# nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Split every cleaned review into a list of word tokens.
df["tokenized"] = df["punc"].apply(word_tokenize)
df.head()

Remove the stop words 

In [None]:
from nltk.corpus import stopwords
stopWords = stopwords.words('english')
# Frozen set copy of the list above: membership tests become a hash lookup
# instead of a linear scan of the whole stop-word list for every token.
_stopWordSet = frozenset(stopWords)
def rmStop(x):
    """Return the tokens of ``x`` that are not English stop words.

    Parameters
    ----------
    x : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        The tokens that do not appear in the NLTK English stop-word list,
        in their original order (duplicates are kept).
    """
    return [word for word in x if word not in _stopWordSet]
# Filter the stop words out of every token list.
df["rmStop"] = df["tokenized"].apply(rmStop)
df.head()

Apply lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
def lem(x):
    """Lemmatize every token in ``x`` with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    x : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    # Build the lemmatizer once per call rather than once per word.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in x]
# Lemmatize the stop-word-free tokens of every review.
df["lemmatized"] = df["rmStop"].apply(lem)
df.head()

## Feature Engineering

Length of text

In [None]:
# Character count of the raw (uncleaned) review text.
df["len"] = df["Text"].apply(len)
df.head()

Number of non-stopwords

In [None]:
# Number of tokens left in each review once stop words were removed.
df["numUseful"] = df["rmStop"].apply(len)
df.head()

Number of capitals

In [None]:
import string
def caps(x):
    """Count the upper-case characters in the string ``x``."""
    return sum(1 for char in x if char.isupper())
# Capital letters are counted on the raw text, before lower-casing.
df["caps"] = df["Text"].apply(caps)
df.head()

Number of punctuation marks

In [None]:
# Raw length minus cleaned length = number of characters the cleaning step
# removed, i.e. everything that is neither alphanumeric nor a plain space.
raw_len = df["Text"].apply(len)
clean_len = df["punc"].apply(len)
df["numPunc"] = raw_len - clean_len
df.head()

Create a dataframe with only positive reviews

In [None]:
# Boolean mask selecting the rows whose Sentiment label is 1.
is_positive = df["Sentiment"] == 1
pos = df[is_positive]
pos.head()

Create a dataframe with only negative reviews

In [None]:
# Boolean mask selecting the rows whose Sentiment label is 0.
is_negative = df["Sentiment"] == 0
neg = df[is_negative]
neg.head()

Compare length of text between positive and negative reviews

In [None]:
# Overlay normalised histograms of review length for the two classes.
# Bin edges sit on multiples of 4 characters. The previous
# np.linspace(0, 160, 40) produced 39 bins of width ~4.1, so neighbouring
# bins covered a different number of integer lengths and the bars were
# unevenly inflated.
len_bins = np.arange(0, 161, 4)
fig, ax = plt.subplots()
ax.hist(pos["len"], len_bins, density=True, alpha=0.4, label="Positive")
ax.hist(neg["len"], len_bins, density=True, alpha=0.4, label="Negative")
ax.legend(loc="upper right")
ax.set_xlabel('Length of text', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.show()

Compare number of useful words between positive and negative reviews

In [None]:
# Overlay normalised histograms of the non-stop-word count for the two classes.
# One bin per integer count, with edges at -0.5, 0.5, ..., 30.5. The previous
# np.linspace(0, 30, 30) produced 29 bins slightly wider than 1, which put
# the counts 0 and 1 into the same first bin.
useful_bins = np.arange(0, 32) - 0.5
fig, ax = plt.subplots()
ax.hist(pos["numUseful"], useful_bins, density=True, alpha=0.4, label="Positive")
ax.hist(neg["numUseful"], useful_bins, density=True, alpha=0.4, label="Negative")
ax.legend(loc="upper right")
ax.set_xlabel('Number of useful words', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.show()

Compare number of capital letters between positive and negative reviews

In [None]:
# Overlay normalised histograms of the capital-letter count for the two classes.
# One bin per integer count, with edges at -0.5, 0.5, ..., 20.5. The previous
# np.linspace(0, 20, 20) produced 19 bins slightly wider than 1, which put
# the counts 0 and 1 into the same first bin.
caps_bins = np.arange(0, 22) - 0.5
fig, ax = plt.subplots()
ax.hist(pos["caps"], caps_bins, density=True, alpha=0.4, label="Positive")
ax.hist(neg["caps"], caps_bins, density=True, alpha=0.4, label="Negative")
ax.legend(loc="upper right")
ax.set_xlabel('Number of capital letters', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.show()

Compare the number of punctuation marks between positive and negative reviews

In [None]:
# Overlay normalised histograms of the punctuation count for the two classes.
# One bin per integer count, with edges at -0.5, 0.5, ..., 10.5. The previous
# np.linspace(0, 10, 10) produced 9 bins slightly wider than 1, which put
# the counts 0 and 1 into the same first bin.
punc_bins = np.arange(0, 12) - 0.5
fig, ax = plt.subplots()
ax.hist(pos["numPunc"], punc_bins, density=True, alpha=0.4, label="Positive")
ax.hist(neg["numPunc"], punc_bins, density=True, alpha=0.4, label="Negative")
ax.legend(loc="upper right")
ax.set_xlabel('Number of punctuation marks', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.show()

Based on the figures above, none of these features is indicative of whether a review is positive or negative.

## Prepare the data

Prepare dataframe to use TF-IDF

In [None]:
# Re-join each review's lemmatized tokens into one space-separated string,
# which is what the TF-IDF vectorizer is fed.
# str.join puts a space only *between* tokens. The previous loop appended a
# space after every word and then called text.strip() without using the
# result, so every string kept a trailing space.
lines = [" ".join(tokens) for tokens in df["lemmatized"]]
df["lines"] = lines
df.head()

Apply TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split, so the vocabulary and IDF weights also see the held-out
# reviews. Consider fitting on the training rows only.
vectorizer = TfidfVectorizer()
# Dense (n_reviews, n_terms) array of TF-IDF weights.
vectored = vectorizer.fit_transform(df['lines']).toarray()
# A bare `vectored.shape` in the middle of a cell is never displayed; print it.
print(vectored.shape)
print(vectored)

Split Data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the reviews for evaluation. The fixed random_state makes the
# split, and so every score reported below, identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    vectored, df['Sentiment'], test_size=0.3, random_state=42
)

## Testing Models

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Seed the forest so the fitted model and its scores are reproducible.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
pred = forest.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall are swapped in the report and the confusion matrix is
# transposed (accuracy is symmetric and was unaffected).
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
print(accuracy_score(y_test, pred))

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the TF-IDF features.
lr = LogisticRegression()
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# precision and recall are swapped in the report and the confusion matrix is
# transposed (accuracy is symmetric and was unaffected).
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
print(accuracy_score(y_test, pred))