# Creating a data file of linguistic features for the Yelp review dataset: sentiment, subjectivity, readability, squared user review rating, and review word count.

In [1]:
from fastparquet import ParquetFile
import pandas as pd

import time
import statistics

from textblob import TextBlob # for sentiment & subjectivity
import textstat # for readability

In [2]:
# Import data: read the joined review table from the parquet file into a DataFrame.
df = pd.read_parquet("joined.parquet.snappy")
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,r_id,b_id,u_id,r_stars,r_date,r_text,r_useful,r_funny,r_cool,b_stars,b_review_count,u_review_count,u_yelping_since,u_friends_count
0,4,6317,164431,5,2015-01-04 00:01:03,"Wow! Yummy, different, delicious. Our favo...",1,0,1,4.0,181,9,2014-01-17 19:20:57,1
1,9,914,154297,3,2016-03-30 22:46:33,This easter instead of going to Lopez Lake we ...,1,1,0,4.5,13,24,2015-10-27 22:53:34,1
2,11,3877,35412,5,2015-06-21 14:48:06,My experience with Shalimar was nothing but wo...,2,0,0,2.5,8,39,2009-01-27 21:20:30,7
3,18,12041,21756,4,2014-08-10 19:41:43,The hubby and I have been here on multiple occ...,1,0,0,4.0,398,74,2009-07-24 14:30:28,112
4,19,295,80128,5,2016-03-07 00:02:18,I go to blow bar to get my brows done by natal...,2,0,1,4.0,55,27,2012-02-26 05:18:05,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2576650,6990276,138213,246850,5,2014-12-17 21:45:20,Latest addition to services from ICCU is Apple...,1,2,1,2.5,36,327,2011-08-05 22:13:01,66
2576651,6990277,144721,1192959,5,2021-03-31 16:55:10,"This spot offers a great, affordable east week...",2,1,2,4.5,30,41,2011-10-18 13:28:02,22
2576652,6990278,149531,116946,4,2019-12-30 03:56:30,This Home Depot won me over when I needed to g...,1,0,0,2.5,32,326,2012-01-02 01:18:09,10
2576653,6990279,138952,199547,5,2022-01-19 18:59:27,For when I'm feeling like ignoring my calorie-...,1,0,0,4.5,83,281,2008-09-10 21:03:38,143


In [3]:
# Pull out the review-text column; the feature functions below are applied to this Series.
text = df["r_text"]
text

0          Wow!  Yummy, different,  delicious.   Our favo...
1          This easter instead of going to Lopez Lake we ...
2          My experience with Shalimar was nothing but wo...
3          The hubby and I have been here on multiple occ...
4          I go to blow bar to get my brows done by natal...
                                 ...                        
2576650    Latest addition to services from ICCU is Apple...
2576651    This spot offers a great, affordable east week...
2576652    This Home Depot won me over when I needed to g...
2576653    For when I'm feeling like ignoring my calorie-...
2576654    Located in the 'Walking District' in Nashville...
Name: r_text, Length: 2576655, dtype: object

In [4]:
# Test the sentiment property from the textblob package on the first review.
# The cell output is a Sentiment(polarity, subjectivity) tuple for the whole text.
tests = TextBlob(text[0])
tests.sentiment

Sentiment(polarity=0.3025568181818182, subjectivity=0.7130681818181818)

In [5]:
#function that averages polarity (sentiment) and subjectivity of a review text by sentence
def textblob_sentiment_subjectivity(row):
    """Return the sentence-averaged TextBlob polarity and subjectivity of a text.

    The text is split into sentences by TextBlob; each sentence's
    ``sentiment`` pair (polarity, subjectivity) is read and the two
    components are averaged over all sentences.

    Parameters
    ----------
    row : str
        The review text to score.

    Returns
    -------
    tuple
        ``(sentiment, subjectivity)`` averaged over the sentences. When the
        text yields no sentences at all (e.g. an empty string) the result is
        ``(0.0, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    sentences = TextBlob(row).sentences
    if not sentences:
        # Nothing to average: return neutral scores rather than dividing by zero.
        return 0.0, 0.0

    polarity_total = 0
    subjectivity_total = 0
    for sentence in sentences:
        # sentiment is a (polarity, subjectivity) pair
        polarity, subjectivity = sentence.sentiment
        polarity_total += polarity
        subjectivity_total += subjectivity

    count = len(sentences)
    return polarity_total / count, subjectivity_total / count

In [6]:
#test average function on a subset of data
subset = text[:10000]
# perf_counter is a monotonic clock intended for measuring elapsed intervals
start_time = time.perf_counter()
# Kept under its own name so that re-running this cell cannot overwrite the
# full-data `sentiments` result with a 10000-row one.
sentiments_subset = subset.apply(textblob_sentiment_subjectivity)
elapsed = time.perf_counter() - start_time
#estimate time cost: scale the subset timing linearly to the full row count
print("--- %s seconds ---" % elapsed)
print("for", len(text), "rows:", len(text)/len(subset) * elapsed/60, "min")

--- 9.049647331237793 seconds ---
for 2576655 rows: 38.863031740450865 min


In [7]:
#apply on the whole data and report the wall-clock cost in minutes
start_time = time.time()
sentiments = text.apply(textblob_sentiment_subjectivity)
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print("--- %s minutes  ---" % elapsed_minutes)

--- 40.148081147670744 minutes  ---


In [8]:
# add columns to dataframe: each element of `sentiments` is a (sentiment, subjectivity) pair,
# so position 0 / 1 is split out via the .str accessor and rounded to 6 decimals
df["r_sen"] = sentiments.str[0].round(6)
df["r_sub"] = sentiments.str[1].round(6)

In [9]:
# Display the frame, now including the r_sen and r_sub columns.
df

Unnamed: 0,r_id,b_id,u_id,r_stars,r_date,r_text,r_useful,r_funny,r_cool,b_stars,b_review_count,u_review_count,u_yelping_since,u_friends_count,r_sen,r_sub
0,4,6317,164431,5,2015-01-04 00:01:03,"Wow! Yummy, different, delicious. Our favo...",1,0,1,4.0,181,9,2014-01-17 19:20:57,1,0.305871,0.775379
1,9,914,154297,3,2016-03-30 22:46:33,This easter instead of going to Lopez Lake we ...,1,1,0,4.5,13,24,2015-10-27 22:53:34,1,0.027222,0.558611
2,11,3877,35412,5,2015-06-21 14:48:06,My experience with Shalimar was nothing but wo...,2,0,0,2.5,8,39,2009-01-27 21:20:30,7,0.280583,0.489970
3,18,12041,21756,4,2014-08-10 19:41:43,The hubby and I have been here on multiple occ...,1,0,0,4.0,398,74,2009-07-24 14:30:28,112,0.130754,0.289683
4,19,295,80128,5,2016-03-07 00:02:18,I go to blow bar to get my brows done by natal...,2,0,1,4.0,55,27,2012-02-26 05:18:05,8,0.236861,0.368667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2576650,6990276,138213,246850,5,2014-12-17 21:45:20,Latest addition to services from ICCU is Apple...,1,2,1,2.5,36,327,2011-08-05 22:13:01,66,0.166667,0.300000
2576651,6990277,144721,1192959,5,2021-03-31 16:55:10,"This spot offers a great, affordable east week...",2,1,2,4.5,30,41,2011-10-18 13:28:02,22,0.152222,0.155000
2576652,6990278,149531,116946,4,2019-12-30 03:56:30,This Home Depot won me over when I needed to g...,1,0,0,2.5,32,326,2012-01-02 01:18:09,10,0.146032,0.237302
2576653,6990279,138952,199547,5,2022-01-19 18:59:27,For when I'm feeling like ignoring my calorie-...,1,0,0,4.5,83,281,2008-09-10 21:03:38,143,0.184919,0.332008


In [11]:
# check the value ranges of the new sentiment (r_sen) and subjectivity (r_sub) columns
print("Mean     SD       Median  Min  Max")
for column in ("r_sen", "r_sub"):
    values = df[column]
    # Series.min()/max() are vectorised and skip NaN, unlike the builtin
    # min()/max(), which iterate element by element in Python.
    print(round(values.mean(), 6), round(values.std(), 6), values.median(), values.min(), values.max())

Mean     SD       Median  Min  Max
0.182309 0.19267 0.180729 -1.0 1.0
0.444489 0.146888 0.441545 0.0 1.0


## Readability (Flesch reading ease)

In [12]:
def readability_func(row):
    """Return textstat's Flesch reading-ease score for the text ``row``."""
    return textstat.flesch_reading_ease(row)

In [13]:
# test on a subset
subset = text[:10000]
# perf_counter is a monotonic clock intended for measuring elapsed intervals
start_time = time.perf_counter()
readability = subset.apply(readability_func)
elapsed = time.perf_counter() - start_time
#estimate time cost: scale the subset timing linearly to the full row count
print("--- %s seconds ---" % elapsed)
print("for", len(text), "rows:", len(text)/len(subset) * elapsed/60, "min")
readability

--- 1.7013099193572998 seconds ---
for 2576655 rows: 7.3061478504359725 min


0       81.59
1       76.15
2       70.43
3       78.59
4       77.06
        ...  
9995    71.44
9996    78.48
9997    73.00
9998    73.81
9999    90.29
Name: r_text, Length: 10000, dtype: float64

In [14]:
#apply function to the whole data (one readability score per review text)
readabilityw = text.apply(readability_func)

In [15]:
# add readability column to df, rounded to 6 decimals
df["r_rea"] = readabilityw.round(6)

In [17]:
#check value range of the readability column
# NOTE(review): the recorded output shows extremes far outside the usual 0-100 band
# (min -2670.59, max 206.84) -- worth inspecting those rows before modelling.
print("Mean      SD        Median Min     Max")
# Series.min()/max() are vectorised and skip NaN, unlike the builtin min()/max().
print(round(df["r_rea"].mean(),6), round(df['r_rea'].std(), 6), df['r_rea'].median(), df["r_rea"].min(), df["r_rea"].max())
df

Mean      SD        Median Min     Max
80.028846 11.169422 80.99 -2670.59 206.84


Unnamed: 0,r_id,b_id,u_id,r_stars,r_date,r_text,r_useful,r_funny,r_cool,b_stars,b_review_count,u_review_count,u_yelping_since,u_friends_count,r_sen,r_sub,r_rea
0,4,6317,164431,5,2015-01-04 00:01:03,"Wow! Yummy, different, delicious. Our favo...",1,0,1,4.0,181,9,2014-01-17 19:20:57,1,0.305871,0.775379,81.59
1,9,914,154297,3,2016-03-30 22:46:33,This easter instead of going to Lopez Lake we ...,1,1,0,4.5,13,24,2015-10-27 22:53:34,1,0.027222,0.558611,76.15
2,11,3877,35412,5,2015-06-21 14:48:06,My experience with Shalimar was nothing but wo...,2,0,0,2.5,8,39,2009-01-27 21:20:30,7,0.280583,0.489970,70.43
3,18,12041,21756,4,2014-08-10 19:41:43,The hubby and I have been here on multiple occ...,1,0,0,4.0,398,74,2009-07-24 14:30:28,112,0.130754,0.289683,78.59
4,19,295,80128,5,2016-03-07 00:02:18,I go to blow bar to get my brows done by natal...,2,0,1,4.0,55,27,2012-02-26 05:18:05,8,0.236861,0.368667,77.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2576650,6990276,138213,246850,5,2014-12-17 21:45:20,Latest addition to services from ICCU is Apple...,1,2,1,2.5,36,327,2011-08-05 22:13:01,66,0.166667,0.300000,80.82
2576651,6990277,144721,1192959,5,2021-03-31 16:55:10,"This spot offers a great, affordable east week...",2,1,2,4.5,30,41,2011-10-18 13:28:02,22,0.152222,0.155000,89.28
2576652,6990278,149531,116946,4,2019-12-30 03:56:30,This Home Depot won me over when I needed to g...,1,0,0,2.5,32,326,2012-01-02 01:18:09,10,0.146032,0.237302,84.27
2576653,6990279,138952,199547,5,2022-01-19 18:59:27,For when I'm feeling like ignoring my calorie-...,1,0,0,4.5,83,281,2008-09-10 21:03:38,143,0.184919,0.332008,74.29


## Everything else: squared rating and word count

In [18]:
# user rating squared -- extremity of a rating
#add to df
df['r_stars_square'] = df['r_stars'].pow(2)
#check values
print("Mean      SD       Median Min Max")
# Series.min()/max() are vectorised and skip NaN, unlike the builtin min()/max().
print(round(df["r_stars_square"].mean(),6), round(df['r_stars_square'].std(), 6), df['r_stars_square'].median(), df["r_stars_square"].min(), df["r_stars_square"].max())

Mean      SD       Median Min Max
15.381916 9.618801 16.0 1 25


In [19]:
#word_count --- length of a review, counted in TextBlob word tokens
def word_count(row):
    """Return how many entries TextBlob's ``words`` tokenisation yields for ``row``."""
    tokens = TextBlob(row).words
    return len(tokens)

In [20]:
#test on subset
subset = text[:10000]
# perf_counter is a monotonic clock intended for measuring elapsed intervals
start_time = time.perf_counter()
length = subset.apply(word_count)
elapsed = time.perf_counter() - start_time
#time cost: scale the subset timing linearly to the full row count
print("--- %s seconds ---" % elapsed)
print("for", len(text), "rows:", len(text)/len(subset) * elapsed/60, "min")

--- 7.063380241394043 seconds ---
for 2576655 rows: 30.333156693148613 min


In [21]:
#apply to whole data (one word count per review text)
lengthw = text.apply(word_count)

In [25]:
# add column to df
df["r_length"] = lengthw
# check values: mean, SD, median, min, max
# Series.min()/max() are vectorised and skip NaN, unlike the builtin min()/max().
print(round(df["r_length"].mean(),6), round(df['r_length'].std(), 6), df['r_length'].median(), df["r_length"].min(), df["r_length"].max())

133.269353 114.982187 101.0 0 1068


In [26]:
# Display the frame with all derived columns (r_sen, r_sub, r_rea, r_stars_square, r_length).
df

Unnamed: 0,r_id,b_id,u_id,r_stars,r_date,r_text,r_useful,r_funny,r_cool,b_stars,b_review_count,u_review_count,u_yelping_since,u_friends_count,r_sen,r_sub,r_rea,r_stars_square,r_length
0,4,6317,164431,5,2015-01-04 00:01:03,"Wow! Yummy, different, delicious. Our favo...",1,0,1,4.0,181,9,2014-01-17 19:20:57,1,0.305871,0.775379,81.59,25,43
1,9,914,154297,3,2016-03-30 22:46:33,This easter instead of going to Lopez Lake we ...,1,1,0,4.5,13,24,2015-10-27 22:53:34,1,0.027222,0.558611,76.15,9,103
2,11,3877,35412,5,2015-06-21 14:48:06,My experience with Shalimar was nothing but wo...,2,0,0,2.5,8,39,2009-01-27 21:20:30,7,0.280583,0.489970,70.43,25,179
3,18,12041,21756,4,2014-08-10 19:41:43,The hubby and I have been here on multiple occ...,1,0,0,4.0,398,74,2009-07-24 14:30:28,112,0.130754,0.289683,78.59,16,109
4,19,295,80128,5,2016-03-07 00:02:18,I go to blow bar to get my brows done by natal...,2,0,1,4.0,55,27,2012-02-26 05:18:05,8,0.236861,0.368667,77.06,25,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2576650,6990276,138213,246850,5,2014-12-17 21:45:20,Latest addition to services from ICCU is Apple...,1,2,1,2.5,36,327,2011-08-05 22:13:01,66,0.166667,0.300000,80.82,25,63
2576651,6990277,144721,1192959,5,2021-03-31 16:55:10,"This spot offers a great, affordable east week...",2,1,2,4.5,30,41,2011-10-18 13:28:02,22,0.152222,0.155000,89.28,25,80
2576652,6990278,149531,116946,4,2019-12-30 03:56:30,This Home Depot won me over when I needed to g...,1,0,0,2.5,32,326,2012-01-02 01:18:09,10,0.146032,0.237302,84.27,16,88
2576653,6990279,138952,199547,5,2022-01-19 18:59:27,For when I'm feeling like ignoring my calorie-...,1,0,0,4.5,83,281,2008-09-10 21:03:38,143,0.184919,0.332008,74.29,25,385


In [27]:
#save file: keep only the review id plus the derived feature columns
save_df = df[["r_id", "r_sen", "r_sub", "r_rea", "r_length", "r_stars_square"]]
# Pin an explicit 64-bit integer id; plain 'int' can resolve to a different width
# depending on platform / numpy version.
save_df = save_df.astype({'r_id':'int64'})

save_df.to_parquet('joined_linguistic_extra.parquet.snappy', compression='snappy', index=False)