In [31]:
import pandas as pd
# Load the review/rating dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv('Data/review_rating.csv')

In [32]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",0
1,Great short read. I didn't want to put it dow...,1
2,I'll start by saying this is the first of four...,0
3,Aggie is Angela Lansbury who carries pocketboo...,0
4,I did not expect this type of book to be in li...,1


In [33]:
# Count missing values per column before any text processing.
df.isnull().sum()

reviewText    0
rating        0
dtype: int64

In [34]:
import re
import nltk
# Fetch the NLTK resources used below: the stopword list is read by
# clean_txt and WordNet backs the lemmatizer instance created in this cell.
nltk.download('stopwords')
# NOTE(review): no tokenizer call is visible in this notebook - confirm that
# the 'punkt_tab' download is still needed.
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk import WordNetLemmatizer
# Module-level lemmatizer; lemmatize_text looks it up by this global name,
# so it must not be rebound to anything else at module level.
w= WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zaheerahmad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/zaheerahmad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zaheerahmad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
def clean_txt(file):
    """Normalize raw review text into lowercase, stopword-free strings.

    Each entry is lowercased, stripped of URLs, e-mail-like tokens and HTML
    markup, reduced to the characters ``a-z``, ``0-9`` and space, and then
    filtered against NLTK's English stopword list.

    Parameters
    ----------
    file : pandas.Series or pandas.DataFrame
        Text to clean. When a DataFrame is passed, only its first column is
        used. Values are converted to ``str`` before processing.

    Returns
    -------
    pandas.Series
        One cleaned, single-space-separated string per input row, with a
        fresh default integer index.
    """
    if isinstance(file, pd.DataFrame):
        file = file.iloc[:, 0]

    file = file.astype(str)

    # Build the stopword set once per call. Constructing it inside the
    # comprehension condition re-read the corpus for every single token.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for raw in file:
        text = raw.lower()
        # Drop URLs and e-mail-like tokens before the HTML parser sees them.
        text = re.sub(r'(http|https|ftp|ssh)://[^\s]+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        text = BeautifulSoup(text, 'lxml').get_text()
        # Anything that is not a lowercase letter, digit or space becomes a
        # space so that punctuation separates tokens instead of gluing them.
        text = re.sub(r'[^a-z0-9 ]', ' ', text)
        tokens = [token for token in text.split() if token not in stop_words]
        # split() + join() already yields single-spaced, trimmed text, so no
        # further whitespace normalization is required.
        corpus.append(' '.join(tokens))
    return pd.Series(corpus)

In [39]:
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of each document.

    Parameters
    ----------
    text : pandas.Series or pandas.DataFrame
        Documents to process. For a DataFrame only the first column is used.
        Values are converted to ``str`` first.

    Returns
    -------
    pandas.Series
        One string per input row in which each token has been passed through
        the module-level lemmatizer ``w`` and re-joined with single spaces.
    """
    column = text.iloc[:, 0] if isinstance(text, pd.DataFrame) else text
    documents = column.astype(str)

    lemmatized = [
        ' '.join(w.lemmatize(token) for token in document.split())
        for document in documents
    ]
    return pd.Series(lemmatized)

In [40]:
# Double brackets keep X as a one-column DataFrame so the ColumnTransformer
# defined later can select 'reviewText' by name; y is the label Series.
X = df[['reviewText']]
y = df['rating']

In [41]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

In [43]:
# Columns routed through the text pipeline by the ColumnTransformer.
cate = ['reviewText']

In [44]:
# Text pipeline: clean -> lemmatize -> TF-IDF over unigrams and bigrams,
# capped at 2500 features.
cate_pipeline = Pipeline([
    ("clean", FunctionTransformer(func=clean_txt, validate=False)),
    ("lemmatize", FunctionTransformer(func=lemmatize_text, validate=False)),
    ("wordtovec", TfidfVectorizer(ngram_range=(1, 2), max_features=2500)),
])

In [45]:
# Apply the text pipeline to the columns listed in `cate`.
preprocess = ColumnTransformer(transformers=[("final", cate_pipeline, cate)])

In [46]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split.
# NOTE(review): this rebinds X_train / X_test to the transformed output, so
# re-running this cell would pass the already-transformed data back into
# `preprocess`. Re-run the train/test split cell first when iterating.
X_train = preprocess.fit_transform(X_train)
X_test = preprocess.transform(X_test)

In [48]:
# pd.DataFrame(X_train.toarray())

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [50]:
# Candidate classifiers keyed by display name; both use library defaults.
models = {
    "LogisticRegression":LogisticRegression(),
    "MultinomialNB": MultinomialNB()
}

In [51]:
from sklearn.metrics import accuracy_score

In [52]:
# Fit every candidate on the vectorized training data and record its accuracy
# on the held-out split.
score = []            # accuracies in `models` insertion order (kept for compatibility)
score_by_model = {}   # the same accuracies keyed by model name
for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    score.append(accuracy)
    score_by_model[name] = accuracy

# Show the labelled results so the comparison is visible in the notebook.
score_by_model


In [53]:
# Train a fresh MultinomialNB as the chosen model and keep its test-set
# predictions.
best = MultinomialNB().fit(X_train, y_train)
y_prd = best.predict(X_test)

# Vectorize a single ad-hoc review with the already-fitted preprocessing so it
# can be scored by `best`.
new = preprocess.transform(
    pd.DataFrame({"reviewText": ["what an amazing personality"]})
)

In [54]:
# `predict` returns an array with one label per input row. Index the single
# prediction explicitly instead of relying on the truth value of the whole
# array, which only works while exactly one row is scored.
res = best.predict(new)
result = "good review" if res[0] == 1 else "bad review"

result

'good review'