In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import csv

In [None]:
# pd.set_option('display.max_rows', 100)
# pd.set_option('max_colwidth', 500)
# pd.set_option('display.max_columns', 500)

reviews = pd.read_csv('reviews.csv')
# reviews.head()
df_host = pd.read_csv('listings_cleaned.csv')
# df_host.head(10)

In [None]:
#removing unnecessary columns
#can do this much more efficiently later
host_filtered = df_host.drop(df_host.iloc[:, 11:59], axis = 1)
host_filtered.head()
# reviews.head()

In [None]:
host_filtered.drop(['scrape_id', 'last_scraped', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'last_review'], axis = 1, inplace = True)

In [None]:
host_filtered.drop(host_filtered.iloc[:, 10:], axis = 1, inplace = True)
host_filtered.head()

In [None]:
#renaming columns for merge on ID/listng ID
reviews.drop(['reviewer_id', 'id'], axis = 1, inplace = True)
reviews.rename(columns = {'listing_id': 'id'}, inplace = True)
reviews.head()

In [None]:
df_stars = host_filtered.drop(df_host.iloc[:, 1:2], axis = 1)
df_stars.head(10)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(2, 3)
fig.tight_layout()

ax[0, 0].hist(df_stars["review_scores_value"]) #row=0, col=0
ax[0, 0].set_title('Value')
ax[1, 0].hist(df_stars["review_scores_checkin"]) #row=1, col=0
ax[1, 0].set_title('Check In')
ax[0, 1].hist(df_stars["review_scores_cleanliness"]) #row=0, col=1
ax[0, 1].set_title('Cleanliness')
ax[1, 1].hist(df_stars["review_scores_communication"]) #row=1, col=1
ax[1, 1].set_title('Communication')
ax[1, 2].hist(df_stars["review_scores_location"]) #row=1, col=2
ax[1 ,2].set_title('Location')
plt.show()

In [None]:
#plot score rating by loc
#if host has multiple listings, take average

#appending price col to host_filtered
price = df_host['price']

host_filtered = host_filtered.join(price)
host_filtered.head()


In [None]:
df_merged = pd.merge(reviews, host_filtered, on = 'id')
#Removing review scores for this df
df_merged.drop(df_merged.iloc[:, 7:12], axis = 1, inplace = True)
df_merged.head()

In [None]:
#Check missing values
df_merged.drop(['review_scores_value'], axis = 1, inplace = True)
df_merged['comments'].isna().sum()

In [None]:
import itertools
import collections

#convert text to lowercase
def convert(lst): 
    return ([i for item in lst for i in item.lower().split()]) 

filtered_comments = convert(df_merged['comments'].fillna("")) #Fill in missing reviews with blank


In [None]:
count_word = collections.Counter(filtered_comments) #Count word frequency

clean_word_count = pd.DataFrame(count_word.most_common(5),
                             columns=['words', 'count'])

clean_word_count.head()


In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
stop_words = set(stopwords.words(['english']))
#View sample set of stopwords
list(stop_words)[0:10]

In [None]:
words_review_cleaned = [word for word in filtered_comments if word not in stop_words]
words_review_count = collections.Counter(words_review_cleaned)

word_review_count_df = pd.DataFrame(words_review_count.most_common(15),
                             columns=['words', 'count'])

In [None]:
import matplotlib.pyplot as plt

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))

# Plot horizontal bar graph
word_review_count_df.sort_values(by='count').plot.barh(x='words',
                      y='count',
                      ax=ax,
                      color="purple")

ax.set_title("Most Frequent Words")

plt.show()

In [None]:
df_merged.info()

In [None]:
df_merged.shape
# df_sample_set = df_merged.iloc[0:100000, :]
# df_sample_set.head()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

In [None]:
#Identifying PScores with builtin Vader SIA
SIA = SentimentIntensityAnalyzer()

positive sentiment: score >= 0.05
neutral sentiment: score > -0.05 and score < 0.05
negative sentiment: score <= -0.05

In [None]:
# df_sample_set.drop(['listing_url', 'host_name', 'price'], axis = 1, inplace = True)
df_merged.drop(['listing_url', 'host_name'], axis = 1, inplace = True)

In [None]:
df_merged['comments'] = df_merged["comments"].apply(str)

In [None]:
# df_sample_set.fillna(0, inplace = True)
df_merged.info()

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

More data clenaing
Extracting root words and removing punctuation etc.

In [None]:
#Grabbing root words
p_stemmer = PorterStemmer()
def stem(sentence):
    tokens = sentence.split()
    stemmed_tokens = [p_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

# df_sample_set = df_sample_set[df_sample_set['comments'].notnull()]
df_merged = df_merged[df_merged['comments'].notnull()]
df_merged['comments'] = df_merged['comments'].apply(stem)

In [None]:
pd.options.mode.chained_assignment = None
stopwords_list = set(stopwords.words("english"))
#List of punctuation to remove
#Handle html tags?
punctuations = """!()-![]{};:,+'"\,<>./?@#$%^&*_~Â""" 

def reviewParse(comments):
    #Split the review into words
    splitReview = comments.split()
    #Takes punctuation out
    parsedReview = " ".join([word.translate(str.maketrans('', '', punctuations)) + " " for word in splitReview])
    return parsedReview
  
def clean_review(comments):
    #Makes all words lowercase
    clean_words = []
    splitReview = comments.split()
    for w in splitReview:
        if w.isalpha() and w not in stopwords_list:
            clean_words.append(w.lower())
    #Joins split words back into sentence
    clean_review = " ".join(clean_words)
    return clean_review

df_merged = df_merged[df_merged['comments'].notnull()]
df_merged['comments'] = df_merged['comments'].apply(reviewParse).apply(clean_review)

In [None]:
#This does not properly rounds all values, no 1 or 2 ratings
#Need to find way to properly round values down beforehand
# df_sample_set['review_scores_rating'] = df_sample_set['review_scores_rating'].astype(np.int64)
# df_sample_set.info()
df_merged.info()

In [None]:
# # df_sample_set['Sentiment'] = df_sample_set['review_scores_rating'].round(decimals = 2).apply(sentiment)
# # df_sample_set.loc('comments')
# df_test = pd.DataFrame()
# # df_test['sentiment'] = df_test['round_review'].apply(sentiment)
# # print(type(df_clean['review_scores_rating']))
# df_test.info()
# # df_test.head()

In [None]:
docs = list(df_merged['comments'])[:200000]

- tfidf(t, d, D) = tf(t, d) * idf(t, D)
- t = term
- d = document
- D = set of documents
- TF-IDF provides a weight
- This weight is a statistical measure used to evaluate how important 
a word is to a document in a collection or corpus. 
- The importance increases proportionally to the number of times a 
word appears in the document but is offset by the frequency of 
the word in the corpus (data-set).

In [None]:
#Running SIA on cleaned dataset
SIA = SentimentIntensityAnalyzer()
for sentence in df_merged['comments'].values[5:10]:
    print(sentence)
    ss = SIA.polarity_scores(sentence)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()

In [None]:
#Correlation between original stars rating and price is low
corr_1 = df_merged["review_scores_rating"]
corr_2 = df_merged["price"]
correlation = corr_2.corr(corr_1)
print(correlation)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 
 
#Settings for count vectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features = 2000) 
 
#Send all docs here 
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)

In [None]:
# print(tfidf_vectorizer.get_feature_names())

In [None]:
import plotly.graph_objs as go

In [None]:
fig = go.Figure([go.Bar(x = df_merged.review_scores_rating.value_counts().index, y = df_merged.review_scores_rating.value_counts().tolist())])
fig.update_layout(
    title="Values in each Sentiment",
    xaxis_title = "Sentiment",
    yaxis_title = "Values")
fig.show()

In [None]:
df_train_set = df_merged.dropna()
df_train_set.head(10)
# df_train_set.to_csv("airbnb-train-set.csv",index=False)

In [None]:
df_train_set["sentiment_scores"] = df_train_set["comments"].apply(lambda x: SIA.polarity_scores(x))
df_train_set = pd.concat([df_train_set.drop(['sentiment_scores'], axis = 1), df_train_set['sentiment_scores'].apply(pd.Series)], axis = 1)
df_train_set.head()

In [None]:
# df_train_set['polarity'] = df_train_set['']

In [None]:
# tfidf_vectorizer.get_feature_names()
# df_train_set.drop_duplicates(inplace=True)
X = tfidf_vectorizer_vectors.toarray()
Y = df_train_set['review_scores_rating'].astype(int)[:200000]

In [None]:
len(X[0])

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV 
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix, classification_report, roc_auc_score,roc_curve,auc
from sklearn.tree import DecisionTreeClassifier

#Dividing into train and validation sets

SEED = 123

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = SEED)

In [None]:
dt = DecisionTreeClassifier(random_state=SEED)
dt.fit(X_train,y_train)
y_pred_test = dt.predict(X_test)
print("Training Accuracy score: "+str(round(accuracy_score(y_train,dt.predict(X_train)),4)))
print("Testing Accuracy score: "+str(round(accuracy_score(y_test,dt.predict(X_test)),4)))

In [None]:
#Must assign neutral values to positive
#Neutral scores tend to have positive sentiment utilizing neutral words
print(classification_report(y_test, y_pred_test, target_names = ['ignore_1', 'ignore_2', 'positive', 'negative']))

In [None]:
cm = confusion_matrix(y_test, y_pred_test)
#print('Confusion matrix\n', cm)
cm_matrix = pd.DataFrame(data=cm, columns=['Ignore', 'Ignore_2', 'Actual Negative', 'Actual Positive'], 
                        index=['Ignore', 'Ignore_2', 'Predict Negative', 'Predict Positive'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_train = gnb.predict(X_train)
y_pred_test = gnb.predict(X_test)
print("Training Accuracy score: " + str(round(accuracy_score(y_train, gnb.predict(X_train)), 4)))
print("Testing Accuracy score: " + str(round(accuracy_score(y_test, gnb.predict(X_test)), 4)))

In [None]:
print(classification_report(y_test, y_pred_test, target_names=['ignore', 'ignore_2', 'positive', 'negative']))

In [None]:
cm = confusion_matrix(y_test, y_pred_test)
#print('Confusion matrix\n', cm)
cm_matrix = pd.DataFrame(data=cm, columns=['ignore', 'ignore_2', 'Actual Negative', 'Actual Positive'], 
                        index=['ignore', 'ignore_2', 'Predict Negative', 'Predict Positive'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=SEED).fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)
print("Training Accuracy score: "+str(round(accuracy_score(y_train,lr.predict(X_train)),4)))
print("Testing Accuracy score: "+str(round(accuracy_score(y_test,lr.predict(X_test)),4)))

In [None]:
print(classification_report(y_test, y_pred_test, target_names=['ignore', 'ignore_2', 'positive', 'negative']))

In [None]:
cm = confusion_matrix(y_test, y_pred_test)
#print('Confusion matrix\n', cm)
cm_matrix = pd.DataFrame(data=cm, columns=['ignore', 'ignore_2', 'Actual Positive', 'Actual Negative'], 
                        index=['ignore', 'ignore_2', 'Predict Positive', 'Predict Negative'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print("Training Accuracy score: "+str(round(accuracy_score(y_train,clf.predict(X_train)), 4)))
print("Testing Accuracy score: "+str(round(accuracy_score(y_test,clf.predict(X_test)), 4)))

In [None]:
print(classification_report(y_test, y_pred_test, target_names=['ignore', 'ignore_2', 'positive', 'negative']))

In [None]:
cm = confusion_matrix(y_test, y_pred_test)
#print('Confusion matrix\n', cm)
cm_matrix = pd.DataFrame(data=cm, columns=['ignore', 'ignore_2', 'Actual Positive', 'Actual Negative'], 
                        index=['ignore', 'ignore_2', 'Predict Positive', 'Predict Negative'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
plt.show()

In [None]:
from sklearn.ensemble import VotingClassifier

classifiers = [('Decision Tree', dt),
               ('Logistic Regression', lr),
                ('Naive Bayes', gnb)
              ]
vc = VotingClassifier(estimators=classifiers)
# Fit 'vc' to the traing set and predict test set labels
vc.fit(X_train, y_train)

print("Training Accuracy score: " + str(round(accuracy_score(y_train,vc.predict(X_train)), 4)))
print("Testing Accuracy score: " + str(round(accuracy_score(y_test,vc.predict(X_test)), 4)))

In [None]:
#Vectorizer setup for all models
docs_2 = list(df_merged['comments'])[:214541]

tfidf_vectorizer_2 = TfidfVectorizer(use_idf=True, max_features = 2000) 
 
#Send all docs here 
tfidf_vectorizer_vectors_2 = tfidf_vectorizer_2.fit_transform(docs)

In [None]:
vectors = tfidf_vectorizer_2.transform(docs_2)
words_df = pd.DataFrame(vectors.toarray(), columns = tfidf_vectorizer_2.get_feature_names_out())
words_df.head(-100)

In [None]:
words_df_clean = words_df.dropna()
words_df_clean.info()

In [None]:
# Predict using all our models. 

# Logistic Regression predictions + probabilities
df_train_set['pred_logreg'] = lr.predict(words_df)
df_train_set['pred_logreg_proba'] = lr.predict_proba(words_df)[:,1]

# Decision Tree predictions + probabilities
df_train_set['pred_forest'] = dt.predict(words_df)
df_train_set['pred_forest_proba'] = dt.predict_proba(words_df)[:,1]

# Bayes predictions + probabilities
df_train_set['pred_bayes'] = gnb.predict(words_df)
df_train_set['pred_bayes_proba'] = gnb.predict_proba(words_df)[:,1]



# df_train_set = pd.concat([df_train_set.drop(['sentiment_scores'], axis = 1), df_train_set['sentiment_scores'].apply(pd.Series)], axis = 1)


In [None]:
df_train_set['rand_forest'] = clf.predict(words_df)
df_train_set['rand_forest_prob'] = clf.predict_proba(words_df)[:,1]

In [None]:
# Voting Classifier predictions + probabilities
df_train_set['vc_class'] = vc.predict(words_df)
df_train_set['vc_class_prob'] = vc.predict_proba(words_df)[:,1]

In [None]:
df_train_set.head(100)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, mean_absolute_error

In [None]:
y_preds = gnb.predict(X_test)

mean_squared_error(y_test, y_preds)

In [None]:
from math import sqrt
rmse = sqrt(mean_squared_error(y_test, y_preds))
rmse

In [None]:
y_preds[:10]

In [None]:
# def compare_model_residuals(models,X,y):
#     f, (ax1, ax2, ax3) = plt.subplots(3, sharex=True, sharey=True)
#     plt.title('Plotting residuals using training (blue) and test (green) data')
#     mean_sq_e = []
#     for m, ax in ((models[0], ax1),(models[1], ax2),(models[2], ax3)):
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#         m[0].fit(X_train, y_train)
#         y_preds_train = m[0].predict(X_train)
#         y_preds_test = m[0].predict(X_test)
#         ax.scatter(m[0].predict(X_train), y_preds_train - y_train,c='#2B94E9',s=40,alpha=0.5)
#         ax.scatter(m[0].predict(X_test), y_preds_test - y_test,c='#94BA65',s=40)
#         ax.hlines(y=0, xmin=0, xmax=100)
#         ax.set_title(m[1])
#         ax.set_ylabel('Residuals')
#         mean_sq_e.append("Model {} with absolute error {}".format(m[1], str(mean_absolute_error(y_test,y_preds_test))))
#     plt.xlim([20,70])
#     plt.ylim([-100,100])  
#     plt.show()
#     print(mean_sq_e)
# models = np.array([(dt,'Decision Tree'), (lr,'Logistic Regression'), (clf,'Random Forest')])
# compare_model_residuals(models, X, Y)

In [None]:
predictions = pd.Series(lr.predict(X), name="sentiment")
results = pd.concat([predictions],axis=1)
results.to_csv("airbnb-review-sentiment.csv",index=False)

Hence we successfully studied various models like Decision Tree, SVM, Naive Bayes and Logistic Regression and implemented them for the given dataset as part of the experiment along with a comparative analysis of various metrics and made the following observations.

Naïve Bayes and Decision Tree are susceptible to noise if present in the dataset because when we reduced the number of features by considering only the most frequent words the accuracy and AUC score increased significantly.
Logistic Regression and SVM performed almost same for the given dataset even with the initial number of features.
We can increase Accuracy marginally by removing Named Entities using spacy and performing Lemmatization on top of that on all the models mentioned above.

In [None]:
# add sentiment anaylsis columns
# df_sample_set["sentiment_scores"] = df_sample_set["comments"].apply(lambda x: SIA.polarity_scores(x))
# df_sample_set = pd.concat([df_sample_set.drop(['sentiment_scores'], axis = 1), df_sample_set['sentiment_scores'].apply(pd.Series)], axis = 1)
# df_sample_set.info()

In [None]:
#Calculates pscores (builtin nltk sentiment analysis)
pscores = [SIA.polarity_scores(comments) for comments in df_train_set['comments']]

In [None]:
pd.Series([score['compound'] for score in pscores]).plot(kind = 'hist')
plt.title('Compound Scores')
plt.xlabel('Scores')
plt.ylabel('frequency')

In [None]:
pd.Series([score['neu'] for score in pscores]).plot(kind='hist')
plt.title('Neutral')
plt.xlabel('Scores')
plt.ylabel('Frequency')

In [None]:
pd.Series([score['pos'] for score in pscores]).plot(kind='hist')
plt.title('Positive Scores')
plt.xlabel('Scores')
plt.ylabel('Frequency')

In [None]:
pd.Series([score['neg'] for score in pscores]).plot(kind='hist', bins=25)
plt.title('Negative Scores')
plt.xlabel('Scores')
plt.ylabel('Frequency')

In [None]:
scored_reviews = pd.DataFrame()
scored_reviews['review'] = [r for r in df_train_set['comments']]
scored_reviews['compound'] = [score['compound'] for score in pscores]
scored_reviews['negativity'] = [score['neg'] for score in pscores]
scored_reviews['neutrality'] = [score['neu'] for score in pscores]
scored_reviews['positivity'] = [score['pos'] for score in pscores]

In [None]:
scored_reviews.query('negativity > 0')

Not many negative reviews

In [None]:
scored_reviews.query('negativity > positivity').query('negativity > 0.1')

In [None]:
#TODO: nltk built in sentiment analysis seems to be not as accurate as I'd like
# marking comments negative even though overall sentiment is positive

In [None]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import Pipeline
nltk.download(['punkt', 'wordnet', 'stopwords'])
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [None]:
def tokenize(text):
    '''
    Input: Text String (str)
    
    Process: 
    1. Tokenize text into tokens
    2. Remove stop words
    3. Lemmatize
    
    Output: List of text tokens for string
    '''
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english')) 
    lemmatizer = WordNetLemmatizer()
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [lemmatizer.lemmatize(w.lower().strip()) for w in tokens]
    return tokens

In [None]:
# Initialize ML pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestRegressor())
])

In [None]:
# Import train_test_split, Split data
from sklearn.model_selection import train_test_split
X = df_test['comments']
Y = df_test['price']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [None]:
# Fit pipeline
# Takes too long to run
pipeline.fit(X_train, y_train)

In [None]:
y_preds = pipeline.predict(X_test)

In [None]:
y_preds = pipeline.predict(X_test)

mean_squared_error(y_test, y_preds)

In [None]:
from math import sqrt
rmse = sqrt(mean_squared_error(y_test, y_preds))
rmse


In [None]:
y_preds[:10]