In [None]:
# Install the third-party `wordcloud` package through a shell escape.
# NOTE(review): `%pip install wordcloud` would target the running kernel's
# environment instead of whichever `pip` is first on the shell PATH — confirm.
!pip install wordcloud

In [None]:
import pandas as pd

# Load only the first 10,000 reviews. Passing `nrows` stops parsing after the
# sample instead of reading the whole file and slicing it afterwards, and the
# result is a standalone DataFrame rather than a slice of a larger one, so the
# later column assignments operate on an independent frame.
data = pd.read_csv('reviews.csv', nrows=10000)

# Preview the last rows of the loaded sample.
data.tail()

In [None]:
# Count the missing values in every column.
data.isna().sum()

In [None]:
# Drop the two columns that are not used in the rest of the analysis.
# Reassigning (instead of `inplace=True`) avoids mutating a frame in place, and
# `errors='ignore'` keeps the cell re-runnable once the columns are already gone.
data = data.drop(columns=['Time_submitted', 'Reply'], errors='ignore')

# Re-check the missing-value counts for the remaining columns.
data.isnull().sum()

In [None]:
import seaborn as sns
# Bar chart of the number of reviews for each value of the 'Rating' column.
sns.countplot(data=data, x='Rating', palette='pastel')

In [None]:
def transform_ratings(rating):
    """Map a numeric rating to a sentiment label.

    Ratings 5 and 4 give "Good", 3 gives "Neutral", and 2 and 1 give "Bad".
    Any other value matches no branch and yields None.
    """
    if rating in (5, 4):
        return "Good"
    elif rating == 3:
        return "Neutral"
    elif rating in (2, 1):
        return "Bad"
    return None

In [None]:
# Add a 'Desc' column by mapping each numeric Rating to its text label
# ("Good" / "Neutral" / "Bad") with transform_ratings.
data['Desc'] = data['Rating'].apply(transform_ratings)

# Add a 'length' column holding the number of characters in each review.
# `.str.len()` counts characters, not words; a word count would be
# `.str.split().str.len()`.
data['length'] = data['Review'].str.len()

# Plot how many reviews fall under each label, then preview the frame.
sns.countplot(x='Desc', data=data, palette='pastel')
data.head()

In [None]:
# Scatter review length against thumbs-up count, coloured by sentiment label.
# Both cut-offs (length < 500, Total_thumbsup < 800) are applied as ONE row
# mask so that x, y and hue are always taken from the same rows; filtering x
# and y separately yields vectors of different lengths.
plot_subset = data[(data['length'] < 500) & (data['Total_thumbsup'] < 800)]
sns.scatterplot(data=plot_subset, x='length', y='Total_thumbsup', hue='Desc')

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate every review into one space-separated string.
reviews = " ".join(data['Review'])

# Build a word cloud from the text of all rows in the "Review" column.
cloud_options = dict(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
)
wordcloud = WordCloud(**cloud_options).generate(reviews)

plt.imshow(wordcloud)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Normalise the review text in a single pass per review: lowercase, strip
# special characters, drop English stop words, then Porter-stem each token.
import nltk

try:
    stop_words = stopwords.words('english')
except LookupError:
    # The stop-word corpus is not installed (e.g. fresh environment): fetch it
    # once and retry instead of failing the whole cell.
    nltk.download('stopwords', quiet=True)
    stop_words = stopwords.words('english')

# Set membership is O(1) per token; `stop_words` itself stays a list.
stop_word_set = set(stop_words)
stemmer = PorterStemmer()

# Anything that is not an ASCII letter, digit, space or hyphen is removed.
special_chars = re.compile('[^a-z A-Z 0-9-]+')


def clean_review(text):
    """Return `text` lowercased, without special characters or stop words, and stemmed.

    The steps run in this order: lowercase and collapse whitespace, delete
    special characters, drop stop words, stem the remaining tokens.
    """
    # Lowercase every token; split/join also collapses repeated whitespace.
    text = " ".join(token.lower() for token in text.split())
    # Special characters are deleted (not replaced by a space).
    text = special_chars.sub('', text)
    # NOTE(review): apostrophes are removed before this filter, so a contraction
    # such as "don't" is compared as "dont" — confirm that is intended.
    kept = (token for token in text.split() if token not in stop_word_set)
    return " ".join(stemmer.stem(token) for token in kept)


data.loc[:, 'Review'] = data['Review'].apply(clean_review)

data.head()["Review"]

In [None]:
from sklearn.model_selection import train_test_split

# Features are the 'Review' texts; the target is the 'Desc' label.
X, y = data['Review'], data['Desc']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=40,
)

# Show the shape of each of the four splits.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label-to-integer mapping from the training labels only, then
# encode both splits with that same fitted encoder.
le = LabelEncoder().fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)

y_train

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Learn the vocabulary (the set of unique tokens) from the training reviews only.
cv = CountVectorizer().fit(X_train)

# Convert each split into numeric token counts using that fitted vocabulary.
X_train_count = cv.transform(X_train)
X_test_count = cv.transform(X_test)

print(X_train_count)

In [None]:
# TF-IDF features over character 2-grams and 3-grams, fitted on the training
# reviews only.
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3)).fit(X_train)

# Apply the fitted vectorizer to both splits.
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf)