In [None]:
!pip install tweet-preprocessor

In [3]:
import preprocessor as p
import pandas as pd
import json
import re

In [4]:
pd.options.display.max_colwidth = 75

### Turn query documents (tweets) into generators

In [115]:
def load_tweets(file):
    with open(file, 'r') as f:
        tweets = (json.loads(line) for line in f.readlines())
    return tweets

In [116]:
ls

[0m[01;34mgulf_timelines[0m/    stream_ุงูุทุฑูุฌ.jsonl  stream_ุฎุฑุจุฒ.jsonl   stream_ูุงููู.jsonl
stream_ุงุจุฎุต.jsonl  stream_ุชูููุญ.jsonl   stream_ูุบุณูุฉ.jsonl


In [154]:
query_gen = load_tweets('stream_ุงูุทุฑูุฌ.jsonl')

### Turn query generators into dataframes

In [155]:
def cleaner(message):
    message = re.sub("([0-9A-Za-z:!/.])", " ", message)
    return message

In [156]:
def tweet_to_df(tweets):
    data = {'text': [], 'screen_name': [], 'geo': [],
       }
    
    for t in tweets:
    
        data['text'].append(t['text'])
        data['screen_name'].append(t['user']['screen_name'])
        data['geo'].append(t['user']['location'])
    
    return pd.DataFrame(data)

In [157]:
query_df = tweet_to_df(query_gen)

In [158]:
query_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 454 entries, 0 to 453
Data columns (total 3 columns):
geo            301 non-null object
screen_name    454 non-null object
text           454 non-null object
dtypes: object(3)
memory usage: 10.7+ KB


### Clean tweets (remove emojis, links, http, etc.)

In [159]:
cleaned_text = []

for tweet in query_df['text']:
    cleaned_text.append(p.clean(cleaner(tweet)))

query_df['cleaned_text']=cleaned_text

In [160]:
cleaned_name = []

for tweet in query_df['screen_name']:
    cleaned_name.append(p.clean(tweet))

query_df['cleaned_name']=cleaned_name

In [161]:
cleaned_geo = []

for tweet in query_df.geo:
    cleaned_geo.append(p.clean(str(tweet)))

query_df['cleaned_geo']=cleaned_geo


In [162]:
query_df.sample(4)

Unnamed: 0,geo,screen_name,text,cleaned_text,cleaned_name,cleaned_geo
291,Bahrain,sarajohn85,RT @FathiyaAjlan1: ุงูุง ุตูุช ุงููุฑูุฌ ูุจูุชู ุชูุงุฏูู\nุชุฏููู ุงููุฑุญ ูููู\nุงูุง ุซ...,ุงูุง ุตูุช ุงููุฑูุฌ ูุจูุชู ุชูุงุฏูู ุชุฏููู ุงููุฑุญ ูููู ุงูุง ุซูุจู ูุณุจุญุฉู ูู ูุฏู ูุงู...,sarajohn85,Bahrain
102,"Dubai, United Arab Emirates",zayynzeyad,ุฃูุณู ุจุงููู ูู ุตุงูุฑุฉ ูุฏุงูู ูุง ุงุจุฑูู ุนุฌูุจ ู ุงุชูุฑุฌ ุนู ููุธุฑ ู ุฃุฏุฎู ูุงููู ุฑู...,ุฃูุณู ุจุงููู ูู ุตุงูุฑุฉ ูุฏุงูู ูุง ุงุจุฑูู ุนุฌูุจ ู ุงุชูุฑุฌ ุนู ููุธุฑ ู ุฃุฏุฎู ูุงููู ุฑูุนุฉ,zayynzeyad,"Dubai, United Arab Emirates"
335,ุฏููุฉ ุงููููุช,mahmoud4325,@Almajlliss ุงูุญูู ุฃูุง ุฃุจู ุฃุนุฑู ุทูุงุจ ุงูุฌุงูุนุฉ ูุฑูุญูู ุจุดูู ูู ุจุณูุงุฑุฉ ูุนูู ...,ุงูุญูู ุฃูุง ุฃุจู ุฃุนุฑู ุทูุงุจ ุงูุฌุงูุนุฉ ูุฑูุญูู ุจุดูู ูู ุจุณูุงุฑุฉ ูุนูู ุฃููุฏ ุฑุงุญ ุชุณู...,mahmoud4325,ุฏููุฉ ุงููููุช
130,,dana_824,RT @FathiyaAjlan1: ุงูุง ุตูุช ุงููุฑูุฌ ูุจูุชู ุชูุงุฏูู\nุชุฏููู ุงููุฑุญ ูููู\nุงูุง ุซ...,ุงูุง ุตูุช ุงููุฑูุฌ ูุจูุชู ุชูุงุฏูู ุชุฏููู ุงููุฑุญ ูููู ุงูุง ุซูุจู ูุณุจุญุฉู ูู ูุฏู ูุงู...,dana_824,


### Remove duplicated tweets

In [163]:
query_df['cleaned_text'].duplicated().sample(5)

280    False
290     True
407     True
294    False
60      True
Name: cleaned_text, dtype: bool

In [164]:
query_cleaned = query_df.drop_duplicates(['cleaned_text'], keep='last')
query_cleaned = query_cleaned.drop(['screen_name', 'text', 'geo'], axis=1)

In [165]:
query_cleaned.shape

(230, 3)

In [166]:
query_cleaned.sample(5)

Unnamed: 0,cleaned_text,cleaned_name,cleaned_geo
158,ุนุจุฏู ุงูุชุตููุฑ,RienMieuxQueMoi,Kuwait
280,ููุง ูุตูุช ูุต ุงูุทุฑูุฌ ุชุนุชุฐุฑุุุุุุุุ,isarah72,ุฏููุฉ ุงููููุช
429,ุงุฎูุงู ุณุชู ูููุง ููุดู ุจููุณ ุงูุทุฑูุฌ ๐ฆ๐ช๏ธ๐ง๐ญ๏ธ๐ธ๐ฆ๏ธ๐ด๐ฒ๏ธ๐ถ๐ฆ๏ธ๐ฐ๐ผ,someoneee04,ุฏููุฉ ุงููููุช
320,_ ุณูุงู ุนูููู ุฏูุชูุฑู ุจุงุฌุฑ ูุง ุฑุงุญ ุงูุฏุฑ ุงุญุถุฑ ุงูุดูุงุฑุน ูุณูุฑู ูุงูุง ุจูุชูุง ุจุตุจุง...,Rawoony_,Kuwait
224,ุงูู ุงููุญูุญูู ุงููู ุนุดุงู ููุถูููู ุงูุทุฑูุฌ ุจุนุฏ,sarasaladwani,


In [167]:
query_cleaned.cleaned_geo.value_counts().head()

None             78
Kuwait           26
ุฏููุฉ ุงููููุช      13
kuwait            9
Salwa, Kuwait     8
Name: cleaned_geo, dtype: int64

In [168]:
query_cleaned.cleaned_name.value_counts().head()

Fahad_alenze     7
Nora_buhamad     4
savagecoffe_t    3
BatoolALQ_       2
mi3ado_1989      2
Name: cleaned_name, dtype: int64

In [169]:
query_cleaned.fillna('None')

Unnamed: 0,cleaned_text,cleaned_name,cleaned_geo
6,_ ุนูู ุงูุงูู ุงุฑุญู ูู ุงู ุชุชุนูุฏ ุนูู ูุฌูุฏ ุงุญุฏ ููุบุฑูู ุงูุชูุงู ูุญุจ ููุบููู ุนู ุง...,typicallynorah,United Arab Emirates
7,ูุญุฏุฉ ููุงูุญุฉ ุงูููุฒ ูู ุงูุทุฑูุฌ,Hasan95780025,Dokhaa
17,ุงูู ูุณุงูุฉ ุงูุทุฑูุฌ ูุงูุชูฺ โ๏ธโ๏ธโ๏ธ,hodaaa_q8,
18,_ ููุงู ุฌููู ููู ููู ุงูุชุทุจูู ุทูุน ูุงููู ููุฎุฏู ููุดุฑ ุจุงูุฌุฑูุฏุฉ ุงูุฑุณููุฉ ูู ุฒู...,DzY9d,Kuwait
22,ุงูุซุงูุซ ุนูู ุงูุทุฑูุฌ ุทููู ุจุงูุฌ ุงูู ุงููู๏ธ,ZuSX6cXU62tZPUW,
26,ููุง ุฃููู ูุณุชุนููู ููุบูุท ุงูุณูุงู ู ุงูุทุฑูุฌ,R__Aey,Konoha
30,ุงุญุจู ูุซุฑ ูุงุชููู ุนูู ุฌูุจ ุงูุทุฑูุฌ ุงุดุฌุงุฑ ุงุญุจู ูุซุฑ ูุงููุณู ุงูุดุชุง ูุชูุงุฌุฑ ุทููุฑู,almutairi43__,
37,ุญุฏุฉ ุญุฏุฉ,_fatmaa114,Kuwait
39,"ูุญุฏ ูุณููู ุงู ูููููู ุจุณุฌ ุชููููู ุจูุต ุงูุงุบููู, ุงูุทุฑูุฌ ุฏูุงูู ูุงูุง ุนูุฏู ุงุบูู...",ig47j,upside down
42,ูุงุดูุช ูุงูุดู ุดูู ูุฐุง,f36s_x,


In [170]:
query_cleaned.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 230 entries, 6 to 453
Data columns (total 3 columns):
cleaned_text    230 non-null object
cleaned_name    230 non-null object
cleaned_geo     230 non-null object
dtypes: object(3)
memory usage: 7.2+ KB


### Consolidate different geo tags

In [103]:
locations = {'Saudi Arabia': ['Kingdom of Saudi Arabia', 'MAKKAH', 'saudi arabia', 'jeddah', 'ููุฉ ุงูููุฑูุฉ', 'ุงูููููุฉ ุงูุนุฑุจูุฉ', 'Jeddah', 'Riyadh', 'ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'jeddah , saudi arabia', 'ุงูุฑูุงุถ', 'ุขููุตูู ุ ุจุฑูุฏู', 'ุฌุฏุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงููุฏููุฉ ุงููููุฑุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'Riyadh, Kingdom of Saudi Arabia', 'makkah - saudi arabia', 'ุฌุฏู', 'Dammam', 'Dammam, Eastern', 'Al Khobar, Kingdom of Saudi Arabia'],
             'Kuwait': ['ุงููููุช', 'Alkuwait', 'kuwait', 'KUWAIT', 'Kuwait , Salwa', 'Qortuba, Kuwait', 'ุฏููุฉ ุงููููุช', 'ุงูุนุงุตูู, ุฏููุฉ ุงููููุช', 'The Capital, Kuwait', 'Al Salam, Kuwait', 'Salwa, Kuwait', 'ูููุจ ุงูุดุฑู - Kuwait']
            }

In [104]:
'MAKKAH' in list(locations.values())[0]

True

In [105]:
'MAKKAH' in list(locations.values())[1]

False

In [106]:
locations.values()

dict_values([['Kingdom of Saudi Arabia', 'MAKKAH', 'saudi arabia', 'jeddah', 'ููุฉ ุงูููุฑูุฉ', 'ุงูููููุฉ ุงูุนุฑุจูุฉ', 'Jeddah', 'Riyadh', 'ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'jeddah , saudi arabia', 'ุงูุฑูุงุถ', 'ุขููุตูู ุ ุจุฑูุฏู', 'ุฌุฏุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงููุฏููุฉ ุงููููุฑุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'Riyadh, Kingdom of Saudi Arabia', 'makkah - saudi arabia', 'ุฌุฏู', 'Dammam', 'Dammam, Eastern', 'Al Khobar, Kingdom of Saudi Arabia'], ['ุงููููุช', 'Alkuwait', 'kuwait', 'KUWAIT', 'Kuwait , Salwa', 'Qortuba, Kuwait', 'ุฏููุฉ ุงููููุช', 'ุงูุนุงุตูู, ุฏููุฉ ุงููููุช', 'The Capital, Kuwait', 'Al Salam, Kuwait', 'Salwa, Kuwait', 'ูููุจ ุงูุดุฑู - Kuwait']])

In [107]:
#pseudo code

for word in query_cleaned['cleaned_geo'].str.split('_'):
    if word[0] in list(locations.values())[0]:
        query_cleaned['cleaned_geo'].replace(word, 'Saudi Arabia', inplace=True)
    else:
        if word[0] in list(locations.values())[1]:
            query_cleaned['cleaned_geo'].replace(word, 'Kuwait', inplace=True)

In [108]:
query_cleaned.cleaned_geo

6                                 United Arab Emirates
7                                               Dokhaa
8                                                 None
17                                                None
18                                              Kuwait
22                                                None
26                                              Konoha
30                                                None
34                                                None
35                                                None
37                                              Kuwait
39                                         upside down
42                                                None
46                                         ูุฑุงุญ ุงููุงูุฏ
48                                              Hateen
51                                              Kuwait
52                                                None
53                                       ููุซุงุฑุฉ ุ

In [109]:
query_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 6 to 453
Data columns (total 3 columns):
cleaned_text    250 non-null object
cleaned_name    250 non-null object
cleaned_geo     250 non-null object
dtypes: object(3)
memory usage: 7.8+ KB


In [187]:
query_cleaned.sample(5)

Unnamed: 0,cleaned_text,cleaned_name,cleaned_geo
308,ุงููุฑูุงููู ุงูููุตุฏ ูู ุงููุญุงูุธู ุงูููุทูู ูุงูู ูู ุงูุทุฑูุฌ ููู ุฎูุทุงู,MmrMmr41,
280,ููุง ูุตูุช ูุต ุงูุทุฑูุฌ ุชุนุชุฐุฑุุุุุุุุ,isarah72,ุฏููุฉ ุงููููุช
55,ุงููู ูุญููฺ ููุง ูุณุงูุฉ ุงูุทุฑูุฌ ูุฃููู ุนูุฏฺ,subzoe,
203,ุงูุซุฑ ุดู ููุฑูุฒ ููู ุงููู ุทุงูุนู ุจุฑูุญ ููุงุณ ู ุจูุต ุงูุทุฑูุฌ ุชุนุชุฐุฑ,Masha3ell_,Kuwait - Bahrain
231,ุจุนุฏู ูู ุงูุทุฑูุฌ ูุงูู ูุดุฑุจ ูุฑู,AishaSalem77,


### Pickle to conduct EDA in seperate notebook

In [111]:
pwd

'/home/jovyan/capstone-35/gulf_twitter_raw'

In [112]:
query_cleaned.to_pickle('../gulf_twitter_pickled/stream_ุงูุทุฑูุฌ.p')