In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob, os, re, string
from datetime import datetime
from dateutil.parser import parse
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from polyglot.detect import Detector
from wordcloud import WordCloud

In [8]:
FDIR = 'data/unclean/'

files = [file for file in glob.glob(FDIR + '*.csv')]
files

['data/unclean/twitter_phrase_taxi-booking.csv',
 'data/unclean/twitter_referral_grab-referral-code.csv',
 'data/unclean/twitter_str_grabcar.csv',
 'data/unclean/twitter_referral_grab-reward-points.csv',
 'data/unclean/play_grab.csv',
 'data/unclean/twitter_referral_grabrewards.csv',
 'data/unclean/twitter_others_Grabeats.csv',
 'data/unclean/gplay_grab.csv',
 'data/unclean/twitter_others_Grabpay.csv',
 'data/unclean/twitter_phrase_ride-cancellation.csv',
 'data/unclean/twitter_phrase_grabeats-grabpay.csv',
 'data/unclean/twitter_str_grabhitch.csv',
 'data/unclean/appstore_tada.csv',
 'data/unclean/appstore_grab.csv',
 'data/unclean/twitter_str_ride-hailing.csv',
 'data/unclean/reddit_merged.csv',
 'data/unclean/twitter_referral_grab-promo-codes.csv',
 'data/unclean/twitter_str_grabshare.csv',
 'data/unclean/play_tada.csv',
 'data/unclean/reddit_gojek_comments.csv',
 'data/unclean/twitter_referral_grab-refer-code.csv',
 'data/unclean/twitter_phrase_grabfoods.csv',
 'data/unclean/twitte

In [9]:
prefixes = ['fb', 'twitter', 'gplay']

files_grp = [[f for f in files if p in f] for p in prefixes]

files_grp

[['data/unclean/fb_groups.csv'],
 ['data/unclean/twitter_phrase_taxi-booking.csv',
  'data/unclean/twitter_referral_grab-referral-code.csv',
  'data/unclean/twitter_str_grabcar.csv',
  'data/unclean/twitter_referral_grab-reward-points.csv',
  'data/unclean/twitter_referral_grabrewards.csv',
  'data/unclean/twitter_others_Grabeats.csv',
  'data/unclean/twitter_others_Grabpay.csv',
  'data/unclean/twitter_phrase_ride-cancellation.csv',
  'data/unclean/twitter_phrase_grabeats-grabpay.csv',
  'data/unclean/twitter_str_grabhitch.csv',
  'data/unclean/twitter_str_ride-hailing.csv',
  'data/unclean/twitter_referral_grab-promo-codes.csv',
  'data/unclean/twitter_str_grabshare.csv',
  'data/unclean/twitter_referral_grab-refer-code.csv',
  'data/unclean/twitter_phrase_grabfoods.csv',
  'data/unclean/twitter_str_ryde.csv',
  'data/unclean/twitter_str_ridesharing.csv',
  'data/unclean/twitter_referral_grab-promo.csv',
  'data/unclean/twitter_phrase_grab-taxi-ride.csv',
  'data/unclean/twitter_othe

In [2]:
def add_lang(df, col_name):
    detect_lang = lambda comment: Detector(str(comment), quiet=True).languages[0]

    df['lang_code'] = df.apply(lambda row: detect_lang(row[col_name]).code, axis=1)
    df['lang_sig'] = df.apply(lambda row: detect_lang(row[col_name]).confidence, axis=1)
    
    return df

In [3]:
def analyse_sentiment_vader(df, col_name):
    sid = SentimentIntensityAnalyzer()
    vader = lambda text: sid.polarity_scores(text)
    
    df['vader'] = df[col_name].apply(vader)
    df = pd.merge(df, df['vader'].apply(pd.Series), left_index=True, right_index=True)
    return df.drop(['vader'], axis=1)

In [19]:
df_fb = pd.read_csv('data/ca_unclean/grab/fb_hitchsg.csv',names=['url','date','comment'])
df_fb['date'] = df_fb['date'].apply(lambda x: parse(str(x)))
df_fb = df_fb[['date','comment']]
df_fb.head(20)

Unnamed: 0,date,comment
0,2018-11-17 18:00:00,"There still be ego,cheapo riders who think 5 c..."
1,2018-11-17 18:00:00,there will be ego people around... just pray u...
2,2018-11-17 18:00:00,Till today there are riders that think as long...
3,2018-11-16 15:13:00,Thk you for sharing n showing what all need to...
4,2018-11-16 15:13:00,Martin Lau
5,2018-11-16 15:13:00,Nice article. Perhaps Grab should incentivise ...
6,2018-11-17 18:00:00,yishun???
7,2018-11-17 18:00:00,That's provided there is a driver who wants to...
8,2018-11-17 18:00:00,$5 still wanna share..
9,2018-11-17 18:00:00,Cheapo to the max


In [20]:
df_fb = add_lang(df_fb, 'comment')
df_fb.head(10)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Unnamed: 0,date,comment,lang_code,lang_sig
0,2018-11-17 18:00:00,"There still be ego,cheapo riders who think 5 c...",en,99.0
1,2018-11-17 18:00:00,there will be ego people around... just pray u...,en,98.0
2,2018-11-17 18:00:00,Till today there are riders that think as long...,en,99.0
3,2018-11-16 15:13:00,Thk you for sharing n showing what all need to...,en,98.0
4,2018-11-16 15:13:00,Martin Lau,to,91.0
5,2018-11-16 15:13:00,Nice article. Perhaps Grab should incentivise ...,en,99.0
6,2018-11-17 18:00:00,yishun???,qu,87.0
7,2018-11-17 18:00:00,That's provided there is a driver who wants to...,en,98.0
8,2018-11-17 18:00:00,$5 still wanna share..,en,94.0
9,2018-11-17 18:00:00,Cheapo to the max,en,94.0


In [22]:
df_fb[df_fb['lang_code'] == 'en'].to_csv('data/ca_cleaned/grab/fb_grab_hitchsg.csv')

In [26]:
df_fb.dtypes

date         datetime64[ns]
comment              object
lang_code            object
lang_sig            float64
dtype: object

In [30]:
df_fb[df_fb['date'] > '2018-11-16']

Unnamed: 0,date,comment,lang_code,lang_sig
0,2018-11-17 18:00:00,"There still be ego,cheapo riders who think 5 c...",en,99.0
1,2018-11-17 18:00:00,there will be ego people around... just pray u...,en,98.0
2,2018-11-17 18:00:00,Till today there are riders that think as long...,en,99.0
3,2018-11-16 15:13:00,Thk you for sharing n showing what all need to...,en,98.0
4,2018-11-16 15:13:00,Martin Lau,to,91.0
5,2018-11-16 15:13:00,Nice article. Perhaps Grab should incentivise ...,en,99.0
6,2018-11-17 18:00:00,yishun???,qu,87.0
7,2018-11-17 18:00:00,That's provided there is a driver who wants to...,en,98.0
8,2018-11-17 18:00:00,$5 still wanna share..,en,94.0
9,2018-11-17 18:00:00,Cheapo to the max,en,94.0


In [9]:
df_fb = add_lang(df_fb, 'comment')
df_fb = analyse_sentiment_vader(df_fb[df_fb['lang_code'] == 'en'], 'comment')

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [10]:
df_fb.head(20)

Unnamed: 0,url,date,comment,lang_code,lang_sig,neg,neu,pos,compound
0,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-17 18:00:00,"There still be ego,cheapo riders who think 5 c...",en,99.0,0.0,0.832,0.168,0.6652
1,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-17 18:00:00,there will be ego people around... just pray u...,en,98.0,0.0,0.839,0.161,0.3182
2,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-17 18:00:00,Till today there are riders that think as long...,en,99.0,0.104,0.896,0.0,-0.2023
3,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-16 15:13:00,Thk you for sharing n showing what all need to...,en,98.0,0.0,0.639,0.361,0.7767
5,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-16 15:13:00,Nice article. Perhaps Grab should incentivise ...,en,99.0,0.0,0.769,0.231,0.8126
7,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-17 18:00:00,That's provided there is a driver who wants to...,en,98.0,0.0,1.0,0.0,0.0
8,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-17 18:00:00,$5 still wanna share..,en,94.0,0.0,1.0,0.0,0.0
9,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-17 18:00:00,Cheapo to the max,en,94.0,0.0,1.0,0.0,0.0
10,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-17 18:00:00,Knn micron staff all very rich yet so cheapo,en,97.0,0.0,0.673,0.327,0.5984
11,https://m.facebook.com/groups/hitchsg?view=per...,2018-11-17 18:00:00,Blacklist this cheapskate,en,96.0,0.0,1.0,0.0,0.0


In [14]:
df_fb = df_fb[['date', 'comment']]
df_fb['source'] = 'fb'
df_fb.to_csv('data/ca_cleaned/grab/fb_grab_hitchsg.csv')

In [15]:
df_comb = [pd.read_csv('data/ca_cleaned/grab_consolidated.csv'), pd.read_csv('data/ca_cleaned/grab/fb_grab_hitchsg.csv')]
df_new = pd.concat(df_comb)

In [16]:
df_new.head(25)

Unnamed: 0.1,Unnamed: 0,date,comment,source
0,27,2018-11-02,More flexible for grabpay and top up,gplay
1,34,2018-11-02,"useful, easy to use, discount everyday,good se...",twitter
2,36,2018-11-02,Thanks,twitter
3,37,2018-11-02,Nice grab,twitter
4,38,2018-11-02,Good,twitter
5,39,2018-11-02,Grab has made my trips more enjoyable and stre...,twitter
6,40,2018-11-02,So far the service are good,twitter
7,41,2018-11-02,Thanks,twitter
8,42,2018-11-02,After I change the phone and reinstall again u...,twitter
9,43,2018-11-02,the fees is getting expensive！dont make ppl ha...,twitter


In [17]:
df_new.to_csv('data/ca_cleaned/grab_consolidated.csv')

In [None]:
new_dir = 'data/en'

if not os.path.exists(new_dir):
    os.makedirs(new_dir)

In [11]:
files_grp[2][0][13:]

'gplay_grab.csv'

In [12]:
df_gp_grps = []

for f in files_grp[2]:
    gp_head = ['name', 'date', 'rating', 'comment']
    df = pd.read_csv(f, names=gp_head, encoding='utf-8')
    df['date'] = pd.to_datetime(df['date'])
    
    df = add_lang(df, 'comment')
    df['source'] = f[13:]
    df_gp_grps.append(df)

df_gp = pd.concat(df_gp_grps)
df_gp

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Unnamed: 0,name,date,rating,comment,lang_code,lang_sig,source
0,A Google user,2018-11-02,5,Very usefull n advantages,en,96.0,gplay_grab.csv
1,A Google user,2018-11-02,1,"Registry / login with a number, I am using my ...",en,99.0,gplay_grab.csv
2,Kumar Subramanian,2018-11-02,5,I think it is a excellent app 😀😀.,en,96.0,gplay_grab.csv
3,A Google user,2018-11-02,5,Good application,en,94.0,gplay_grab.csv
4,Zaky Mochamad,2018-11-02,2,You don't imagine how irritated when you alrea...,en,99.0,gplay_grab.csv
5,Bernadette Alciso,2018-11-02,1,Slow connection for location,en,96.0,gplay_grab.csv
6,Bernd Bsser,2018-11-02,1,Your old version was much more customerfriendl...,en,99.0,gplay_grab.csv
7,suryadi taufan,2018-11-02,5,Good app,en,90.0,gplay_grab.csv
8,A Google user,2018-11-02,5,My favorite online transportation,en,97.0,gplay_grab.csv
9,ChenHoong Koon,2018-11-02,1,No drivers whenever suck chair,en,96.0,gplay_grab.csv


In [18]:
df_tw_grps = []

for f in files_grp[1]:
    df = pd.read_csv(f)
    df['tweet'] = df['tweet'].apply(lambda row: ''.join([x for x in str(row) if x in string.printable]))
    df = add_lang(df, 'tweet')
    df['source'] = f[13:]
    df_tw_grps.append(df)

df_tw = pd.concat(df_tw_grps)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [16]:
df_fb = pd.read_csv(files_grp[0][0], encoding='utf-8')
df_fb = add_lang(df_fb, 'comment')
df_fb['source'] = f[13:]

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [20]:
df_lang_grps = [df_gp[['lang_code','lang_sig']],df_tw[['lang_code','lang_sig']],df_fb[['lang_code','lang_sig']]]

df_lang = pd.concat(df_lang_grps)
df_lang.head(20)

Unnamed: 0,lang_code,lang_sig
0,en,96.0
1,en,99.0
2,en,96.0
3,en,94.0
4,en,99.0
5,en,96.0
6,en,99.0
7,en,90.0
8,en,97.0
9,en,96.0


In [21]:
df_lang.shape

(123080, 2)

In [28]:
df_lang['lang_code'].value_counts()

en         92476
id         14798
ms          4725
tl          3643
ht          1927
un          1126
nl           637
vi           267
da           252
de           215
ceb          165
sk           117
jw           117
fr            90
th            88
nn            86
aa            83
lt            83
ga            83
es            81
sco           78
kha           77
sw            68
war           56
so            53
tlh           53
sv            52
ts            50
su            48
br            47
           ...  
ig             8
kl             8
oc             8
mfe            7
ny             7
crs            7
vo             7
mi             6
ja             6
km             6
nso            6
zh_Hant        5
co             5
fo             5
ca             4
xx_Qaai        4
ar             4
az             3
ss             2
ik             2
bs             2
ru             2
el             1
ne             1
sd             1
ko             1
ta             1
zu            

In [5]:
for f in files_gp:
    gp_head = ['name', 'date', 'rating', 'comment']
    df = pd.read_csv(f, names=gp_head, encoding='utf-8')
    df['date'] = pd.to_datetime(df['date'])
    
    df = add_lang(df, 'comment')
    
    df[df['lang_code'] == 'en'].to_csv(new_dir + '/' + f[5:], encoding='utf-8')

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [19]:
for f in files_tw:
    df = pd.read_csv(f)
    df = add_lang(df, 'tweet')
    
    df[df['lang_code'] == 'en'].to_csv(new_dir + '/' + f[5:])

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

error: ('input contains invalid UTF-8 around byte 44 (of 78)', 'occurred at index 904')

In [28]:
for f in files_fb:
    df = pd.read_csv(f, encoding='utf-8')
    df = add_lang(df, 'comment')
    
    df[df['lang_code'] == 'en'].to_csv(new_dir + '/' + f[5:], encoding='utf-8')

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [29]:
def process_google_play(f):
    gp_head = ['name', 'date', 'rating', 'comment']
    df = pd.read_csv(f, names=gp_head, encoding='utf-8')
    df['date'] = pd.to_datetime(df['date'])
    df['source'] = f[5:]
    
    return add_lang(df, 'comment')

gp_list = [process_google_play(f) for f in files_gp]

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

AttributeError: 'list' object has no attribute 'concat'

In [30]:
gp_all = pd.concat(gp_list)
gp_all.to_csv(new_dir + '/' + 'gplay_consolidated.csv')
gp_all[gp_all['lang_code'] == 'en'].to_csv(new_dir + '/' + 'gplay_consolidated_en.csv')

In [5]:
def process_twitter(f):
    df = pd.read_csv(f, encoding='iso-8859-1')
    return add_lang(df, 'tweet')

tw_list = [process_twitter(f) for f in files_tw]

error: ('input contains invalid UTF-8 around byte 2 (of 173)', 'occurred at index 3')

In [37]:
tw_all = pd.concat(tw_list)
tw_all.to_csv(new_dir + '/' + 'twitter_consolidated.csv')
tw_all[tw_all['lang_code'] == 'en'].to_csv(new_dir + '/' + 'twitter_consolidated_en.csv')

NameError: name 'tw_list' is not defined