In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv


### Load the Dataset into a DataFrame

In [2]:
# Location of the TripAdvisor reviews CSV, relative to the notebook's working directory.
REVIEWS_CSV = '../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'

# Raw reviews; later cells read the "Rating" and "Review" columns from this frame.
df = pd.read_csv(REVIEWS_CSV)

### Clean the Data and Balance the Positive and Negative Reviews

In [3]:
# Build a class-balanced frame: every negative review plus an equal number of
# five-star reviews.

# First, create a DataFrame with just the five-star reviews.
df_five = df.loc[df["Rating"] == 5].reset_index(drop=True)

# Now, another one with ratings 1 and 2, to represent the negative sentiments.
#
# Note that we are leaving out the 3- and 4-star reviews. This is simply because
# the 1-2 star reviews are more likely to capture the negative aspects, while
# there are enough 5-star reviews to make up the positive part.
df_neg = df.loc[df["Rating"] < 3].reset_index(drop=True)

# Since we want an equal number of positive and negative reviews, keep only as
# many five-star reviews as there are negative ones. The count is derived from
# df_neg instead of a hard-coded end label, so the classes stay balanced if the
# dataset changes. Positional .iloc slicing is end-exclusive, so this takes
# exactly len(df_neg) rows (or all of df_five if it has fewer).
df_pos = df_five.iloc[:len(df_neg)]

# Next, join the two DataFrames vertically (negatives first, then positives)
# and reset the index so it runs 0..n-1 over the combined frame.
df_all = pd.concat([df_neg, df_pos], axis=0).reset_index(drop=True)

### Create a Sentiment Column

In [4]:
# Add a Sentiment column: "Positive" when Rating is 5, "Negative" otherwise
# (the frame built above only contains ratings 1, 2 and 5).
df_all["Sentiment"] = np.where(df_all["Rating"] == 5, "Positive", "Negative")

# Shuffle the rows so the two classes are interleaved, then renumber the index.
# A fixed random_state makes the shuffle reproducible from a fresh kernel;
# without it every run produced a different row order.
# NOTE: this reassigns df_all, so re-running the cell alone shuffles the
# already-shuffled frame again; re-run from the previous cell for a clean state.
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)

### Split into test and training examples

In [5]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation (scikit-learn's default test size is
# used, since none is passed).
#   random_state - fixes the split so the accuracy/F1 figures reported below can
#                  be reproduced on a re-run.
#   stratify     - keeps the Positive/Negative proportion the same in the train
#                  and test partitions, preserving the balance created earlier.
x_train, x_test, y_train, y_test = train_test_split(
    df_all["Review"],
    df_all["Sentiment"],
    random_state=42,
    stratify=df_all["Sentiment"],
)

# Renumber each partition from 0 so positions in x_* and y_* line up with a
# clean index instead of the shuffled row labels inherited from df_all.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in (x_train, x_test, y_train, y_test)
)

### Vectorize all the Reviews to convert from Text to Sparse Matrix

In [6]:
# Vectorize all the reviews with TF-IDF weighting (TfidfVectorizer), not raw bag-of-words counts.
# NOTE(review): CountVectorizer is imported here but not used in the visible cells.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
v = TfidfVectorizer()
# Fit the vectorizer on the training reviews only and transform them in one step.
x_train_vec = v.fit_transform(x_train)
# Transform the test reviews with the already-fitted vectorizer (no re-fitting on test data).
x_test_vec = v.transform(x_test)

### Using an SVM model, fit the training data in vector form 

In [7]:
from sklearn import svm

# Train a linear-kernel support vector classifier on the vectorized training
# reviews; fit() hands back the estimator itself, so it can be chained.
clf_svm = svm.SVC(kernel='linear').fit(x_train_vec, y_train)

# Echo the fitted estimator so the cell still displays its repr.
clf_svm


SVC(kernel='linear')

### Find the Accuracy and F1 Score

In [8]:
# Accuracy of the fitted classifier on the held-out test reviews.
clf_svm.score(X=x_test_vec, y=y_test)

0.9614187927815806

In [9]:
# F1 score per class, to make sure accuracy is not hiding an excess of false
# positives or false negatives for either sentiment.
from sklearn.metrics import f1_score

# Predicted sentiment for every held-out review.
y_pred = clf_svm.predict(x_test_vec)

# average=None returns one score per class rather than a single aggregate.
# NOTE(review): presumably ordered by sorted label (Negative, Positive) — confirm.
f1_score(y_true=y_test, y_pred=y_pred, average=None)

array([0.96050955, 0.9622871 ])

### Test the model with your own reviews

In [10]:
# A few hand-written reviews to sanity-check the trained model.
re1 = [
    "Beautiful location and ambience, cant wait to come back",
    "Incredibly overrated and dull",
    "Enjoyed myself thoroughly",
]

# Reuse the vectorizer fitted on the training data so the features line up
# with what the classifier was trained on.
re1_vec = v.transform(re1)

# Predicted sentiment label for each review, in the same order as re1.
clf_svm.predict(re1_vec)

array(['Positive', 'Negative', 'Positive'], dtype=object)