In [None]:
!pip install tweet-preprocessor

In [1]:
import preprocessor as p
import pandas as pd
import json
import re

In [3]:
# Show up to 75 characters per cell when pandas renders a DataFrame.
pd.set_option('display.max_colwidth', 75)

### Turn query documents (tweets) into generators

In [2]:
def load_tweets(file):
    """Load the documents stored in a JSON Lines file.

    Parameters
    ----------
    file : str or path-like
        Path to a file holding one JSON document per line.

    Returns
    -------
    generator
        Yields one decoded JSON object per non-blank line, in file order.
        Being a generator, it can be consumed only once.

    Raises
    ------
    OSError
        Immediately, if the file cannot be opened.
    json.JSONDecodeError
        During iteration, if a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere.
    with open(file, 'r', encoding='utf-8') as f:
        # Collect the raw lines while the file is still open. Blank lines
        # (e.g. a trailing newline at the end of the file) are skipped because
        # json.loads() rejects empty input.
        lines = [line for line in f if line.strip()]
    # Parsing stays lazy: each line is decoded only when the caller asks for it.
    return (json.loads(line) for line in lines)

In [9]:
ls

stream_ุงุจุฎุต.jsonl  stream_ุงูุทุฑูุฌ.jsonl  stream_ุฎุฑุจุฒ.jsonl  stream_ูุบุณูุฉ.jsonl


In [50]:
query_gen = load_tweets('stream_ุงูุทุฑูุฌ.jsonl')

### Turn query generators into dataframes

In [51]:
def tweet_to_df(tweets):
    """Build a DataFrame from an iterable of tweet dictionaries.

    Each tweet must provide ``tweet['text']``, ``tweet['user']['screen_name']``
    and ``tweet['user']['location']``; a missing key raises ``KeyError``.

    The resulting frame has one row per tweet and three columns: ``text``,
    ``screen_name`` and ``geo``. Note that ``geo`` is filled from the user's
    ``location`` field.
    """
    # Consume the iterable exactly once, pulling out the three fields we keep.
    rows = [
        (tw['text'], tw['user']['screen_name'], tw['user']['location'])
        for tw in tweets
    ]

    # Pivot the row tuples into one list per column.
    column_names = ('text', 'screen_name', 'geo')
    data = {
        name: [row[position] for row in rows]
        for position, name in enumerate(column_names)
    }

    return pd.DataFrame(data)

In [52]:
query_df = tweet_to_df(query_gen)

In [53]:
query_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 3 columns):
geo            231 non-null object
screen_name    354 non-null object
text           354 non-null object
dtypes: object(3)
memory usage: 8.4+ KB


### Clean tweets (remove emojis, links, http, etc.)

In [54]:
# Pass every raw tweet through the preprocessor's clean() and keep the result
# in a new column next to the original text.
cleaned_text = [p.clean(raw_text) for raw_text in query_df['text']]
query_df['cleaned_text'] = cleaned_text

In [55]:
# Apply the same clean() pass to the screen names and store them in their own
# column.
cleaned_name = [p.clean(handle) for handle in query_df['screen_name']]
query_df['cleaned_name'] = cleaned_name

In [56]:
# Clean the location strings. Values are converted with str() first, so a
# missing location (None) is handed to clean() as the literal text 'None'
# rather than staying a null.
cleaned_geo = [p.clean(str(place)) for place in query_df.geo]
query_df['cleaned_geo'] = cleaned_geo


In [57]:
query_df.sample(4)

Unnamed: 0,geo,screen_name,text,cleaned_text,cleaned_name,cleaned_geo
319,Kuwait,h73849092,ูุงููุงุนู ุงูุทุฑูุฌ ุจูุฑู ุตูุจ ุญููู ูุณูุฑุุ,ูุงููุงุนู ุงูุทุฑูุฌ ุจูุฑู ุตูุจ ุญููู ูุณูุฑุุ,h73849092,Kuwait
322,ุงููููุช,abotalal71,@shacaleta1 ุดุงุทุฑู ูุง ุนุทูุงุช .... ุงููุถุน ูุง ุตูุงุฑู...,ุดุงุทุฑู ูุง ุนุทูุงุช .... ุงููุถุน ูุง ุตูุงุฑูุฎ ููุง ุบูุฑู ู...,abotalal71,ุงููููุช
242,,Ai2x1,@i29xx_ ุฃู ูุทูู ุงูุทุฑูุฌ ุฃุฏุนู ุนุงูู ูุนุทูุฌ ุงูููุณู ๐,ุฃู ูุทูู ุงูุทุฑูุฌ ุฃุฏุนู ุนุงูู ูุนุทูุฌ ุงูููุณู,Ai2x1,
332,,DeviantFatma,ุณููููุนุงู\nุงูุทุฑูุฌ ุญู ุดููุฎ ุจุงุฌุฑ ูุซู ูู ููู ููุงุ,ุณููููุนุงู ุงูุทุฑูุฌ ุญู ุดููุฎ ุจุงุฌุฑ ูุซู ูู ููู ููุงุ,DeviantFatma,


### Remove duplicated tweets

In [58]:
query_df['cleaned_text'].duplicated().sample(5)

238    False
73      True
141    False
312    False
138     True
Name: cleaned_text, dtype: bool

In [59]:
# For tweets whose cleaned text is identical, keep only the last occurrence,
# then discard the raw columns so only the cleaned ones remain.
query_cleaned = (
    query_df
    .drop_duplicates(['cleaned_text'], keep='last')
    .drop(['screen_name', 'text', 'geo'], axis=1)
)

In [60]:
query_cleaned.shape

(207, 3)

In [61]:
query_cleaned.sample(5)

Unnamed: 0,cleaned_text,cleaned_name,cleaned_geo
89,: โข ุฎุงุทุฑู ุทุงุจ ูู ุฐุงู ุงููุฑูุฌ ุตูุฏ ูููุจู ูุงููุง ุชู...,wael_alyaseen,
313,ูุง ุฑุงุญ ุงุฏุงูู ุจุณ ุงูุง ุนูู ุฑุงุณู ููุดู ููุชุญูู ูู ุงู...,MHA25_,ุนุงููู ุฏููู ุตุบูุฑ
263,ูู ุฒุนููุุจุนูููู ุงูุณูุฏ ูุธุฑุฉ ูุง ุดูุชูุง ุญุณูุช ุจุงูุญุฒู...,Maysonalsaleh84,
273,ูุง ุงุฑูู ... ุงูุทุฑูุฌ ูุถููู ุงูุฎููู,HKHALAWI,ุงููู ุซู ุงููุทู ุซู ุฑุฆูุณ ุงูุฏููุฉ
321,ุงูููู ูุงููู ูุงูู ููุณ ุงูููุฑู ุงูุญูู ุนุทูู ุงูููุฏุงุฑ...,baadryaa,KUWAIT


In [62]:
query_cleaned.cleaned_geo.value_counts().head()

None             72
Kuwait           19
ุฏููุฉ ุงููููุช      12
Salwa, Kuwait     8
kuwait            7
Name: cleaned_geo, dtype: int64

In [63]:
query_cleaned.cleaned_name.value_counts().head()

Fahad_alenze      7
Nora_buhamad      4
savagecoffe_t     3
hadeelnasser34    2
albraikiq8        2
Name: cleaned_name, dtype: int64

In [64]:
# fillna() returns a new frame and leaves the original untouched, so the
# result has to be bound for the replacement to be kept.
query_cleaned = query_cleaned.fillna('None')

# Preview a few rows instead of rendering the entire frame.
query_cleaned.head(10)

Unnamed: 0,cleaned_text,cleaned_name,cleaned_geo
6,: ุนูู ุงูุงูู ุงุฑุญู ูู ุงู ุชุชุนูุฏ ุนูู ูุฌูุฏ ุงุญุฏ ููุบุฑ...,typicallynorah,United Arab Emirates
7,ูุญุฏุฉ ููุงูุญุฉ ุงูููุฒ ูู ุงูุทุฑูุฌ,Hasan95780025,Dokhaa
8,ุงูุง ุตูุช ุงููุฑูุฌ ูุจูุชู ุชูุงุฏูู ุชุฏููู ุงููุฑุญ ูููู ุง...,FathiyaAjlan1,
17,ุงูู ูุณุงูุฉ ุงูุทุฑูุฌ ูุงูุชูฺ โ๏ธโ๏ธโ๏ธ,hodaaa_q8,
18,ููุงู ุฌููู ููู ููู ุงูุชุทุจูู.ุทูุน ูุงููู ููุฎุฏู ููุดุฑ...,DzY9d,Kuwait
22,ุงูุซุงูุซ ุนูู ุงูุทุฑูุฌ ุทููู ุจุงูุฌ ุงูู ุงููู๏ธ,ZuSX6cXU62tZPUW,
26,ููุง ุฃููู ูุณุชุนููู ููุบูุท ุงูุณูุงู ู ุงูุทุฑูุฌ,R__Aey,Konoha
30,ุงุญุจู ูุซุฑ ูุงุชููู ุนูู ุฌูุจ ุงูุทุฑูุฌ ุงุดุฌุงุฑ ุงุญุจู ูุซุฑ ...,almutairi43__,
34,"ูุญุฏ ูุณููู ุงูAUX ูููููู ุจุณุฌ ุชููููู ุจูุต ุงูุงุบููู,...",skullxcrusherx,
35,ุงูุง ูุธุงูู ุงุฎูููู ูุณูุนูู ุงูู ุฏูุงูู ูุงูู ุจุนุฏู,_Alsh67,


In [65]:
query_cleaned.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207 entries, 6 to 353
Data columns (total 3 columns):
cleaned_text    207 non-null object
cleaned_name    207 non-null object
cleaned_geo     207 non-null object
dtypes: object(3)
memory usage: 6.5+ KB


### Consolidate different geo tags

In [80]:
locations = {'Saudi Arabia': ['Kingdom of Saudi Arabia', 'MAKKAH', 'saudi arabia', 'jeddah', 'ููุฉ ุงูููุฑูุฉ', 'ุงูููููุฉ ุงูุนุฑุจูุฉ', 'Jeddah', 'Riyadh', 'ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'jeddah , saudi arabia', 'ุงูุฑูุงุถ', 'ุขููุตูู ุ ุจุฑูุฏู', 'ุฌุฏุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงููุฏููุฉ ุงููููุฑุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'Riyadh, Kingdom of Saudi Arabia', 'makkah - saudi arabia', 'ุฌุฏู', 'Dammam', 'Dammam, Eastern', 'Al Khobar, Kingdom of Saudi Arabia'],
             'Kuwait': ['ุงููููุช', 'Alkuwait', 'kuwait', 'KUWAIT', 'Kuwait , Salwa', 'Qortuba, Kuwait', 'ุฏููุฉ ุงููููุช', 'ุงูุนุงุตูู, ุฏููุฉ ุงููููุช', 'The Capital, Kuwait', 'Al Salam, Kuwait', 'Salwa, Kuwait', 'ูููุจ ุงูุดุฑู - Kuwait']
            }

In [81]:
# Sanity check: 'MAKKAH' should be listed among the Saudi Arabia aliases.
'MAKKAH' in locations['Saudi Arabia']

True

In [82]:
# Sanity check: 'MAKKAH' should not be listed among the Kuwait aliases.
'MAKKAH' in locations['Kuwait']

False

In [83]:
locations.values()

dict_values([['Kingdom of Saudi Arabia', 'MAKKAH', 'saudi arabia', 'jeddah', 'ููุฉ ุงูููุฑูุฉ', 'ุงูููููุฉ ุงูุนุฑุจูุฉ', 'Jeddah', 'Riyadh', 'ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'jeddah , saudi arabia', 'ุงูุฑูุงุถ', 'ุขููุตูู ุ ุจุฑูุฏู', 'ุฌุฏุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงููุฏููุฉ ุงููููุฑุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'Riyadh, Kingdom of Saudi Arabia', 'makkah - saudi arabia', 'ุฌุฏู', 'Dammam', 'Dammam, Eastern', 'Al Khobar, Kingdom of Saudi Arabia'], ['ุงููููุช', 'Alkuwait', 'kuwait', 'KUWAIT', 'Kuwait , Salwa', 'Qortuba, Kuwait', 'ุฏููุฉ ุงููููุช', 'ุงูุนุงุตูู, ุฏููุฉ ุงููููุช', 'The Capital, Kuwait', 'Al Salam, Kuwait', 'Salwa, Kuwait', 'ูููุจ ุงูุดุฑู - Kuwait']])

In [90]:
# Collapse the known spellings of a location onto one canonical country name.
#
# Build a reverse lookup (alias -> country) from `locations`. Looking countries
# up by name means the result does not depend on the order of the dictionary's
# entries.
alias_to_country = {
    alias: country
    for country, aliases in locations.items()
    for alias in aliases
}

# Translate every value with a single exact-match lookup. Values that are not a
# known alias (including missing values) are left unchanged.
#
# This replaces a per-row loop that split each value on '_' and ran an in-place
# replace over the whole column for every row: that was quadratic, failed on
# missing values, and relied on a chained in-place update that pandas does not
# guarantee to write back to the parent frame.
query_cleaned = query_cleaned.assign(
    cleaned_geo=query_cleaned['cleaned_geo'].map(
        lambda place: alias_to_country.get(place, place)
    )
)

In [91]:
query_cleaned.cleaned_geo

6        United Arab Emirates
7                      Dokhaa
8                        None
17                       None
18                     Kuwait
22                       None
26                     Konoha
30                       None
34                       None
35                       None
37                     Kuwait
39                upside down
42                       None
46                ูุฑุงุญ ุงููุงูุฏ
48                     Hateen
51                     Kuwait
52                       None
53              ููุซุงุฑุฉ ุงูุดูุฑู
55                       None
66                       None
67                       None
68                       None
76                     Kuwait
78                     Kuwait
80                     Kuwait
82                     Kuwait
83                     Kuwait
89                       None
97               Saudi Arabia
98               Saudi Arabia
                ...          
320                    Kuwait
321               

In [86]:
query_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 207 entries, 6 to 353
Data columns (total 3 columns):
cleaned_text    207 non-null object
cleaned_name    207 non-null object
cleaned_geo     207 non-null object
dtypes: object(3)
memory usage: 6.5+ KB


In [87]:
query_cleaned.sample(5)

Unnamed: 0,cleaned_text,cleaned_name,cleaned_geo
303,ุจูุณูุฑูู ุงูุทุฑูุฌ ุนุดุงู ูุคุชูุฑ ุงูููุฉ,om_zeena_93,
279,: ุญูุงูู ุงููู .. ูุงู ุดุงุก ุงููู ุชููู ุงูุฃููุฑ ุนูู ู...,MaherAlMalalha,Kuwait
137,: ูู ุตุฌุฌ ุงุญุณ ุจููุฒููู ุจูุต ุงูุทุฑูุฌ,m_memoshi,Kuwait
220,ุฑุงุญ ุงูุชูู ูุงู ุงูุทุฑูุฌ ุจุนูุฏ ุจุณ ูุง ุงุฑุฏ ุญูุงุก ๏ธ,al3baidly656,
89,: โข ุฎุงุทุฑู ุทุงุจ ูู ุฐุงู ุงููุฑูุฌ ุตูุฏ ูููุจู ูุงููุง ุชู...,wael_alyaseen,


### Pickle to conduct EDA in a separate notebook

In [88]:
pwd

'/home/jovyan/capstone/gulf_twitter_raw'

In [89]:
query_cleaned.to_pickle('../gulf_twitter_pickled/stream_ุงูุทุฑูุฌ.p')