In [None]:
import numpy as np
import pandas as pd

In [None]:
# Raw review export that every later cell derives its data from.
REVIEWS_CSV = "NYC_2021_airbnb_reviews_data1.csv"
df = pd.read_csv(REVIEWS_CSV)
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame (shows which columns have gaps).
df.info()

In [None]:
# Distinct listing ids present in the review data.
# Named `listing_ids` rather than `id` so the built-in id() is not shadowed for the
# rest of the kernel session.
listing_ids = df["listing_id"].unique()
len(listing_ids)

In [None]:
# Work on an explicit copy of the three columns of interest: later cells add and drop
# columns on `review`, and doing that on a plain selection of `df` makes pandas emit
# SettingWithCopyWarning.
review = df[["listing_id", "review_posted_date", "review"]].copy()
review.head()

In [None]:
import nltk
# Corpora needed by this notebook: the stop-word lists used by process() and the
# VADER lexicon that SentimentIntensityAnalyzer() loads when it is constructed.
nltk.download('stopwords')
nltk.download('vader_lexicon')

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
import re

# Languages whose NLTK stop-word lists are removed from every review.
_STOPWORD_LANGUAGES = ('english', 'french', 'spanish', 'german')
# Lazily built union of the lists above; None until process() first needs it.
_stopword_cache = None


def _multilingual_stopwords():
    """Return the combined stop-word set, building it on first use only."""
    global _stopword_cache
    if _stopword_cache is None:
        _stopword_cache = frozenset(
            word
            for language in _STOPWORD_LANGUAGES
            for word in stopwords.words(language)
        )
    return _stopword_cache


def process(text):
    """Normalise one review for sentiment scoring.

    Every run of non-letter characters is replaced by a single space, the text is
    lower-cased and tokenised, and English/French/Spanish/German stop words are
    dropped.

    Args:
        text: the raw review. Anything that is not a ``str`` (for example the
            NaN that pandas uses for a missing review) is treated as empty.

    Returns:
        The remaining tokens joined by single spaces; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""
    letters_only = re.sub('[^a-zA-Z]+', ' ', text).strip()
    tokens = wordpunct_tokenize(letters_only.lower())
    # A cached set gives O(1) membership tests instead of rebuilding and scanning a
    # list of four stop-word corpora for every single review.
    blocked = _multilingual_stopwords()
    return " ".join(token for token in tokens if token not in blocked)
# Smoke-test the cleaner on the review stored under index label 0.
process(review['review'][0])

In [None]:
# Run the cleaner over every raw review and keep the result next to the original text.
cleaned_reviews = review['review'].apply(process)
review['clean_text'] = cleaned_reviews
review.head()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
# Spot-check the analyzer on one cleaned review and on blank text.
text = review['clean_text'][46]
print(text)
# Printed explicitly: a bare expression in the middle of a cell is evaluated but its
# value is never displayed, so these scores were previously lost.
print(analyzer.polarity_scores(text))
analyzer.polarity_scores(" ")

In [None]:
def _compound_score(cleaned):
    """Return VADER's 'compound' polarity score for one cleaned review."""
    return analyzer.polarity_scores(cleaned)['compound']


# One sentiment score per review, computed from the cleaned text.
review['review_score_cleaned'] = review['clean_text'].apply(_compound_score)
review.head()

In [None]:
# The raw text is no longer needed once clean_text and the score exist.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell safe to
# re-run: a second in-place drop of the same column raises KeyError.
review = review.drop(columns='review', errors='ignore')

In [None]:
# Written without a header row or index column, so every line holds only the
# comma-separated column values; the MapReduce scripts in this notebook read this
# file with line.split(',') and positional indexes.
# NOTE(review): those scripts expect the column order listing_id, review_posted_date,
# clean_text, review_score_cleaned, i.e. the cells above run exactly once, in order.
review.to_csv("reviews_with_sentiment.csv", header=False, index=False)

In [None]:
from matplotlib import pyplot as plt
# Distribution of the per-review sentiment scores; labelled so the figure can be
# read on its own.
ax = review["review_score_cleaned"].hist()
ax.set_xlabel("review_score_cleaned")
ax.set_ylabel("number of reviews")
ax.set_title("Distribution of review sentiment scores");

!python sentiment_month_and_year.py -r local reviews_with_sentiment.csv --no-bootstrap-mrjob

Add-on analyses
1. MapReduce: compute the average (plus the min and max) review_score_cleaned (sentiment) of each listing
2. MapReduce: compute the average review_score_cleaned for each month-and-year combination
3. MapReduce: compute the average review_score_cleaned for each calendar month only (all years pooled)
4. Box plot of the review_score_cleaned of each listing
5. Line graph of review_score_cleaned vs. review_posted_date of each listing
In [None]:
%%file sentiment_listing.py
from mrjob.job import MRJob

class SentimentListing(MRJob):
    """MapReduce job summarising the sentiment scores of each listing.

    Every input line is split on commas; field 0 is read as the integer listing id
    and field 3 as the float sentiment score.
    """

    def mapper(self, _, line):
        """Yield ``(listing_id, score)`` for lines whose score lies in [-1, 1]."""
        fields = line.split(',')
        listing_id = int(fields[0])
        score = float(fields[3])
        if -1 <= score <= 1:
            yield listing_id, score

    def reducer(self, key, values):
        """Fold one listing's scores into count / average / max / min in one pass."""
        n_scores = 0
        score_sum = 0
        # The mapper only emits scores within [-1, 1], so the opposite bounds are
        # safe starting points for the running extremes.
        highest = -1
        lowest = 1

        for score in values:
            n_scores += 1
            score_sum += score
            if score > highest:
                highest = score
            if score < lowest:
                lowest = score

        summary = {
            "count": n_scores,
            "average": score_sum / n_scores,
            "max": highest,
            "min": lowest,
        }
        yield key, summary


if __name__ == '__main__':
    SentimentListing.run()

In [None]:
# Run the per-listing job on the exported CSV and capture its output in listing.txt,
# which a later cell parses as tab-separated key / JSON-stats pairs.
!python sentiment_listing.py -r local reviews_with_sentiment.csv --no-bootstrap-mrjob > listing.txt

In [None]:
import json

# listing.txt holds one "<listing_id> TAB <JSON stats>" row per listing.
listing = pd.read_csv("listing.txt", sep="\t", header=None)
listing.columns = ["listing_id", "stats"]

# Decode the JSON payload once, then expand it into one typed column per statistic.
listing_stats = listing.pop("stats").map(json.loads)
for stat_name, caster in (("count", int), ("average", float), ("max", float), ("min", float)):
    listing[stat_name] = [caster(stats[stat_name]) for stats in listing_stats]
listing.head()

In [None]:
# Only keep listings that have at least this many scored reviews.
MIN_REVIEWS = 5
has_enough_reviews = listing['count'] >= MIN_REVIEWS
listing = listing[has_enough_reviews]
listing.info()

In [None]:
# Locate the listings with the highest and lowest average sentiment.
# idxmax()/idxmin() return index labels that actually exist in `listing` (first
# occurrence on ties). The earlier row-by-row scan started from a placeholder index
# of 0, which is not guaranteed to survive the count filter above.
average_scores = listing["average"]
max_index = average_scores.idxmax()
min_index = average_scores.idxmin()
max_average = average_scores.loc[max_index]
min_average = average_scores.loc[min_index]
print(f'listing with max average:\n{listing.loc[max_index]}')
print()
print(f'listing with min average:\n{listing.loc[min_index]}')

In [None]:
# One row per listing, carrying the list of all its review scores.
group = (
    review.groupby('listing_id')['review_score_cleaned']
    .apply(list)
    .reset_index(name='listvalues')
)
group

import os

# Listings drawn per figure. The pages are derived from len(group), so the loop no
# longer depends on a hard-coded page count and listing total.
LISTINGS_PER_PLOT = 50

# savefig() does not create missing directories.
os.makedirs('boxplots', exist_ok=True)

for i, start in enumerate(range(0, len(group), LISTINGS_PER_PLOT), start=1):
    page = group.iloc[start:start + LISTINGS_PER_PLOT]
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111)
    # Vertical boxes are matplotlib's default, so no orientation argument is passed.
    ax.boxplot(page['listvalues'].tolist())
    ax.set_xticklabels(page['listing_id'])
    ax.set_xlabel('listing_id')
    ax.set_ylabel('review_score_cleaned')
    plt.xticks(rotation=90)
    plt.title("Scores of Listing Box Plot")
    plt.savefig(f'boxplots/bp{i}', bbox_inches='tight')

In [None]:
import os

# savefig() does not create missing directories.
os.makedirs('boxplots', exist_ok=True)

# Every listing side by side in a single, very wide figure.
fig = plt.figure(figsize=(50, 7))
ax = fig.add_subplot(111)
# Vertical boxes are matplotlib's default, so no orientation argument is passed.
ax.boxplot(group['listvalues'].tolist())
ax.set_xticklabels(group['listing_id'])
ax.set_xlabel('listing_id')
ax.set_ylabel('review_score_cleaned')
plt.xticks(rotation=90)
plt.title("Scores of Listing Box Plot")
plt.savefig('boxplots/bp_long', bbox_inches='tight')

In [None]:
import datetime as dt
# Reduce each posting date to a 'YYYY-MM' string so reviews can be grouped by month.
# The timestamps are formatted directly: the former detour through
# .dt.to_period('m') produced the same strings but relied on a lowercase alias,
# whereas pandas documents 'M' as the monthly period alias.
review['review_posted_date'] = pd.to_datetime(review['review_posted_date']).dt.strftime('%Y-%m')
review['review_posted_date']

In [None]:
# One row per posting month, carrying the list of all scores posted in that month,
# ordered chronologically.
group2 = (
    review.groupby('review_posted_date')['review_score_cleaned']
    .apply(list)
    .reset_index()
    .sort_values(by='review_posted_date')
)
group2

In [None]:
import os

# savefig() does not create missing directories.
os.makedirs('boxplotsMY', exist_ok=True)

# Score distribution of every posting month, side by side.
fig = plt.figure(figsize=(50, 7))
ax = fig.add_subplot(111)
# Vertical boxes are matplotlib's default, so no orientation argument is passed.
ax.boxplot(group2['review_score_cleaned'].tolist())
ax.set_xticklabels(group2['review_posted_date'])
ax.set_xlabel('review posted date')
ax.set_ylabel('review_score_cleaned')
plt.xticks(rotation=90)
# The boxes are grouped by posting month here, not by listing.
plt.title("Scores by Month and Year Box Plot")
plt.savefig('boxplotsMY/bp_long', bbox_inches='tight')

In [None]:
# month_year.txt holds one "<month year> TAB <JSON stats>" row per key.
month_year = pd.read_csv("month_year.txt", sep="\t", header=None)
month_year.columns = ["month_year", "stats"]

# Decode the JSON payload once, then expand it into one typed column per statistic.
month_year_stats = month_year.pop("stats").map(json.loads)
for stat_name, caster in (("count", int), ("average", float), ("max", float), ("min", float)):
    month_year[stat_name] = [caster(stats[stat_name]) for stats in month_year_stats]
month_year.head()

In [None]:
# Rewrite the keys as 'YYYY-MM' so that a plain string sort is chronological.
# The timestamps are formatted directly: the former detour through
# .dt.to_period('m') produced the same strings but relied on a lowercase alias,
# whereas pandas documents 'M' as the monthly period alias.
month_year['month_year'] = pd.to_datetime(month_year['month_year']).dt.strftime('%Y-%m')
month_year = month_year.sort_values(by='month_year')
month_year.head()

In [None]:
import os

# savefig() does not create missing directories.
os.makedirs('linegraphMY', exist_ok=True)

# Average sentiment per posting month, in chronological order.
fig = plt.figure(figsize=(50, 7))
ax = fig.add_subplot(111)
ax.plot(month_year['month_year'], month_year['average'])
ax.set_xlabel('review posted date')
ax.set_ylabel('review_score_cleaned')
plt.xticks(rotation=90)
plt.title("Scores by Month and Year")
plt.savefig('linegraphMY/lg_long', bbox_inches='tight')

In [None]:
# month.txt holds one "<month name> TAB <JSON stats>" row per key.
month = pd.read_csv("month.txt", sep="\t", header=None)
month.columns = ["month", "stats"]

# Decode the JSON payload once, then expand it into one typed column per statistic.
month_stats = month.pop("stats").map(json.loads)
for stat_name, caster in (("count", int), ("average", float), ("max", float), ("min", float)):
    month[stat_name] = [caster(stats[stat_name]) for stats in month_stats]
month.head()

In [None]:
# Calendar order for the month names; an alphabetical sort would scramble them.
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
calendar_month = pd.Categorical(month['month'], categories=months, ordered=True)
month = month.assign(month=calendar_month).sort_values(by='month')
month.head()

In [None]:
import os

# savefig() does not create missing directories.
os.makedirs('linegraphM', exist_ok=True)

# Average sentiment per calendar month.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
ax.plot(month['month'], month['average'])
ax.set_xlabel('review posted date')
ax.set_ylabel('review_score_cleaned')
plt.xticks(rotation=45)
plt.title("Scores by Month")
plt.savefig('linegraphM/lg_long', bbox_inches='tight')

In [None]:
%%file sentiment_month_and_year.py
from mrjob.job import MRJob
import re

# A run of letters, one space, then four digits at the start of the date field.
DATE_RE = re.compile(r"[a-zA-Z]* [0-9]{4}")


class SentimentMonthYear(MRJob):
    """MapReduce job summarising sentiment scores per posted-date key.

    Every input line is split on commas; field 1 is the review posted date and
    field 3 the float sentiment score.
    """

    def mapper(self, _, line):
        """Yield ``(posted_date, score)`` for well-formed dates and in-range scores."""
        fields = line.split(',')
        posted = fields[1].strip()
        score = float(fields[3])
        if DATE_RE.match(posted) is None:
            return
        if not (-1 <= score <= 1):
            return
        yield posted, score

    def reducer(self, key, values):
        """Fold one key's scores into count / average / max / min in one pass."""
        score_sum = 0.0
        n_scores = 0
        # The mapper only emits scores within [-1, 1], so the opposite bounds are
        # safe starting points for the running extremes.
        highest = -1
        lowest = 1
        for score in values:
            score_sum += score
            n_scores += 1
            if score > highest:
                highest = score
            if score < lowest:
                lowest = score
        summary = {
            "count": n_scores,
            "average": score_sum / n_scores,
            "max": highest,
            "min": lowest,
        }
        yield key, summary


if __name__ == '__main__':
    SentimentMonthYear.run()

In [None]:
# Run the month-and-year job on the exported CSV and capture its output in
# month_year.txt, the file parsed by the month_year cells of this notebook.
!python sentiment_month_and_year.py -r local reviews_with_sentiment.csv --no-bootstrap-mrjob > month_year.txt

In [None]:
%%file sentiment_month.py
from mrjob.job import MRJob
import re

# A run of letters, one space, then four digits at the start of the date field.
DATE_RE = re.compile(r"[a-zA-Z]* [0-9]{4}")


class SentimentMonth(MRJob):
    """MapReduce job summarising sentiment scores per month name.

    Every input line is split on commas; field 1 is the review posted date and
    field 3 the float sentiment score. The key is the first whitespace-separated
    token of the date.
    """

    def mapper(self, _, line):
        """Yield ``(month, score)`` for well-formed dates and in-range scores."""
        fields = line.split(',')
        posted = fields[1].strip()
        score = float(fields[3])
        if DATE_RE.match(posted) is None:
            return
        if not (-1 <= score <= 1):
            return
        month_name = posted.split()[0]
        yield month_name, score

    def reducer(self, key, values):
        """Fold one month's scores into count / average / max / min in one pass."""
        score_sum = 0.0
        n_scores = 0
        # The mapper only emits scores within [-1, 1], so the opposite bounds are
        # safe starting points for the running extremes.
        highest = -1
        lowest = 1
        for score in values:
            score_sum += score
            n_scores += 1
            if score > highest:
                highest = score
            if score < lowest:
                lowest = score
        summary = {
            "count": n_scores,
            "average": score_sum / n_scores,
            "max": highest,
            "min": lowest,
        }
        yield key, summary


if __name__ == '__main__':
    SentimentMonth.run()

In [None]:
# Run the per-month job on the exported CSV and capture its output in month.txt,
# the file parsed by the month cells of this notebook.
!python sentiment_month.py -r local reviews_with_sentiment.csv --no-bootstrap-mrjob > month.txt

In [None]:
%%file sentiment_words.py
from mrjob.job import MRJob

class SentimentWords(MRJob):
    """MapReduce job counting word frequencies per sentiment class.

    Every input line is split on commas; field 2 is the cleaned review text and
    field 3 the float sentiment score.
    """

    def mapper(self, _, line):
        """Yield ``(sentiment, {word: count})`` for one non-empty, in-range review."""
        fields = line.split(',')
        clean_text = fields[2].strip()
        score = float(fields[3])
        if not clean_text or not (-1 <= score <= 1):
            return

        # Count every word longer than one character.
        counts = {}
        for token in clean_text.split():
            if len(token) > 1:
                counts[token] = counts.get(token, 0) + 1

        # Bucket the score: <= -0.05 negative, below 0.05 neutral, otherwise positive.
        if score <= -0.05:
            label = "negative"
        elif score < 0.05:
            label = "neutral"
        else:
            label = "positive"

        ranked = dict(sorted(counts.items(), key=lambda pair: pair[1], reverse=True))
        yield label, ranked

    def reducer(self, key, values):
        """Merge the per-review counts of one class; keep words seen more than once."""
        merged = {}
        for counts in values:
            for token, occurrences in counts.items():
                merged[token] = merged.get(token, 0) + occurrences
        repeated = {token: total for token, total in merged.items() if total > 1}
        ranked = dict(sorted(repeated.items(), key=lambda pair: pair[1], reverse=True))
        yield key, ranked


if __name__ == '__main__':
    SentimentWords.run()

In [None]:
# Run the word-frequency job on the exported CSV and capture its output in words.txt.
!python sentiment_words.py -r local reviews_with_sentiment.csv --no-bootstrap-mrjob > words.txt