In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import nltk
from nltk.corpus import stopwords
# The 'stopwords' corpus has to be present before .words() is called; on a
# fresh environment the lookup fails otherwise. The download is a no-op when
# the corpus is already installed.
nltk.download('stopwords')
# NOTE: this rebinds `stopwords` from the imported corpus reader to the plain
# list of English stopwords used by clean_a_text.
stopwords = stopwords.words('english')

In [None]:
# Read 'yelp.csv' into a DataFrame and preview its first three rows.
rev = pd.read_csv('yelp.csv')
rev.head(n=3)

In [None]:
# Parse the 'date' column once and derive calendar features from it.
date = pd.DatetimeIndex(rev['date'])
rev['year'], rev['month'] = date.year, date.month
rev['day'] = date.strftime('%a')   # abbreviated weekday name

In [None]:
# Check that the new year / month / day columns were added.
rev.head(n=2)

In [None]:
# Distribution of star ratings.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count, while the
# keyword form works on old and new releases alike.
sns.countplot(x='stars', data=rev)

In [None]:
# Keep only the columns used downstream, then restrict the data to the two
# classes being modelled: 5-star and 2-star reviews.
reviews = rev.loc[:, ['business_id', 'user_id', 'date', 'text',
                      'year', 'month', 'day', 'stars']]
c1 = reviews['stars'].eq(5)
c2 = reviews['stars'].eq(2)
reviews = reviews.loc[c1 | c2]

In [None]:
# Preview the filtered frame.
reviews.head(n=2)

In [None]:
from sklearn.model_selection import train_test_split                  #split the data 'reviews' into train & test
# Hold out 25% of the filtered reviews for testing; the fixed seed keeps the
# split repeatable between runs.
train, test = train_test_split(
    reviews,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Order the training rows by their 'date' value, then pull the target class
# ('stars') and the raw review text out of each split.
train = train.sort_values(by='date')
X_train, y_train = train['text'], train['stars']
X_test, y_test = test['text'], test['stars']

In [None]:
# Set the categorical columns aside in their own frames so they can be
# target-encoded separately from the text features.
sub_X_train, sub_X_test = (
    split[['business_id', 'user_id', 'day']] for split in (train, test)
)


In [None]:
import category_encoders as ce                             
def cat_trans(a,b,c):
    """Target-encode the categorical columns of a train/test pair.

    The encoder is fitted on `a` with target `c` only and is then applied to
    both `a` and `b`.

    Parameters
    ----------
    a : frame holding the 'business_id', 'user_id' and 'day' columns to fit on.
    b : frame with the same columns, transformed with the fitted encoder.
    c : target values aligned with `a`.

    Returns
    -------
    tuple
        (encoded `a`, encoded `b`).
    """
    encoder = ce.TargetEncoder(cols=['business_id', 'user_id', 'day'])
    encoder.fit(a, c)
    encoded_a = encoder.transform(a)
    encoded_b = encoder.transform(b)
    return encoded_a, encoded_b

In [None]:
# Target-encode the categorical columns, fitting on the training split only.
# cat_trans returns a tuple: a[0] is the encoded train frame, a[1] the encoded test frame.
a = cat_trans(sub_X_train, sub_X_test, y_train)

In [None]:
import nltk
# Fetch the 'punkt' tokenizer resource through NLTK's downloader
# (presumably what nltk.word_tokenize in clean_a_text relies on).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')

In [None]:
import nltk
# Fetch the 'wordnet' resource through NLTK's downloader
# (presumably required by the WordNetLemmatizer used in clean_a_text).
nltk.download('wordnet')

In [None]:
import string

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
# Shared NLP helpers: `lema` is used by clean_a_text; `stemmer` is created
# alongside it but is not referenced in the visible cells.
stemmer, lema = PorterStemmer(), WordNetLemmatizer()

In [None]:
import re

In [None]:
def clean_a_text(text):
    """Tokenise `text` and return its cleaned, lemmatised words as a list.

    Steps, in order:
      1. tokenise with nltk.word_tokenize;
      2. drop tokens that occur in string.punctuation, lower-case the rest;
      3. drop tokens found in the module-level `stopwords`;
      4. lemmatise every remaining token as a verb;
      5. split on the characters excluded by the first regex, then keep only
         runs of two or more ASCII letters.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        if token in string.punctuation:
            continue
        lowered = token.lower()
        if lowered in stopwords:
            continue
        kept.append(lema.lemmatize(lowered, pos=wordnet.VERB))

    # First pass: break the text apart on digits and selected symbols.
    fragments = re.findall("[^~''`/*0-9... :]+", ' '.join(kept))
    # Second pass: keep alphabetic words that are at least two letters long.
    return re.findall('[a-zA-Z][a-zA-Z]+', ' '.join(fragments))

In [None]:
from wordcloud import WordCloud
                                                    #function for plotting word cloud of all words in the text
def plot_word_cloud(text):
    """Build a word cloud from the string `text` and display it with matplotlib.

    The cloud is drawn on a black 1500x900 canvas with a minimum font size of
    10, then shown in a 10x8 figure with the axes hidden.
    """
    cloud_options = dict(
        width=1500,
        height=900,
        background_color='black',
        stopwords=None,
        min_font_size=10,
    )
    wordcloud_instance = WordCloud(**cloud_options).generate(text)

    plt.figure(figsize=(10, 8), facecolor=None)
    plt.imshow(wordcloud_instance)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# Combine the cleaned words of every review into one space-separated string.
# A single join over all words (rather than appending character by character)
# keeps a space between consecutive reviews, so the last word of one review is
# no longer fused with the first word of the next, and it avoids the quadratic
# cost of repeated string concatenation.
words = ' '.join(
    word
    for text in reviews['text']
    for word in clean_a_text(text)
)

In [None]:
# Draw the word cloud for the combined, cleaned review text.
plot_word_cloud(words)

In [None]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [None]:
# Bag-of-words vectorizer that uses clean_a_text as its analyzer, so each
# document is turned into features by that function.
words_vector = CountVectorizer(analyzer= clean_a_text)

In [None]:
# Learn the vocabulary from the training text only.
words_vector.fit(train['text'])

In [None]:
# Turn the review text of both splits into count matrices using the
# vocabulary fitted on the training text.
X_train_values, X_test_values = (
    words_vector.transform(split['text']) for split in (train, test)
)

In [None]:
# Shapes of the train and test count matrices.
X_train_values.shape, X_test_values.shape

In [None]:
# TF-IDF re-weighting step applied on top of the raw counts.
tfidf = TfidfTransformer()

In [None]:
# Learn the IDF weights from the training counts only, then apply those same
# weights to the test counts. Re-fitting on the test matrix would weight the
# two splits with different statistics and leak test information into the
# features.
# NOTE: both names are overwritten with their TF-IDF versions, so re-create
# the count matrices before running this cell a second time.
X_train_values = tfidf.fit_transform(X_train_values)
X_test_values = tfidf.transform(X_test_values)

In [None]:
# Words learned by the CountVectorizer analyzer, used as column labels below.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(words_vector, 'get_feature_names_out'):
    features = words_vector.get_feature_names_out()
else:
    features = words_vector.get_feature_names()

In [None]:
# Number of vocabulary entries (one column per word).
len(features)

In [None]:
# Dense, labelled versions of the TF-IDF matrices: one column per vocabulary
# word, rows indexed like the split they came from so they can be
# concatenated with the other feature frames.
X_train_after_tfidf = pd.DataFrame(
    data=X_train_values.toarray(),
    index=train.index,
    columns=features,
)
# The test frame is named `X_text_after_tfidf` because the final concat cell
# refers to it by that name.
X_text_after_tfidf = pd.DataFrame(
    data=X_test_values.toarray(),
    index=test.index,
    columns=features,
)

In [None]:
# Numeric date features, to be concatenated with the target-encoded and
# TF-IDF frames.
train_y_m, test_y_m = train[['year', 'month']], test[['year', 'month']]

In [None]:
# Final design matrices: target-encoded categoricals, year/month and TF-IDF
# word weights placed side by side (aligned on the row index).
# NOTE: X_train / X_test previously held the raw text Series and are rebound here.
X_train = pd.concat(
    [a[0], train_y_m, X_train_after_tfidf],
    axis='columns',
)
X_test = pd.concat(
    [a[1], test_y_m, X_text_after_tfidf],
    axis='columns',
)

In [None]:
#X_train.head(2)

In [None]:
#X_test.head(2)

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Oversample the training data with SMOTE, then fit a multinomial Naive Bayes
# classifier. SMOTE draws random synthetic samples, so it is seeded with the
# same value as the train/test split to make the reported metrics
# reproducible between runs.
my_pipeline = make_pipeline(SMOTE(random_state=42),
                            MultinomialNB())

In [None]:
# Fit the pipeline on the raw NumPy arrays of the training features and labels.
my_pipeline.fit(X_train.values, y_train.values)

In [None]:
# Predict from the raw array, matching how the pipeline was fitted
# (X_train.values). Passing the DataFrame to a model fitted without feature
# names is inconsistent and triggers a feature-name mismatch warning.
pred = my_pipeline.predict(X_test.values)

In [None]:
# Per-class precision / recall / F1, followed by the confusion matrix
# (true labels first, predictions second).
report = classification_report(y_test, pred)
matrix = confusion_matrix(y_test, pred)
print(report, matrix, sep='\n')