In [1]:
#!pip install tweet-preprocessor

In [528]:
import preprocessor as p
import pandas as pd
import json
import re

In [529]:
# Show up to 75 characters per cell so more of each tweet is visible in previews
pd.set_option('display.max_colwidth', 75)

In [703]:
pwd

'/home/jovyan/capstone-35/gulf_twitter_pickled'

In [704]:
cd ../gulf_twitter_raw/gulf_timelines/

/home/jovyan/capstone-35/gulf_twitter_raw/gulf_timelines


In [705]:
ls

user_timeline_Basbosah.jsonl


### Turn query documents (tweets) into generators

In [124]:
def load_tweets(file):
    """Parse a JSON Lines file of tweets and return a generator of decoded objects.

    The file is read in full and closed before this function returns, so the
    returned generator never touches a closed file handle; only the JSON
    decoding of each line is deferred until the generator is consumed.

    Parameters
    ----------
    file : str or path-like
        Path to a file holding one JSON document per line.

    Returns
    -------
    generator
        Single-use generator yielding one decoded object per non-blank line.

    Raises
    ------
    OSError
        If the file cannot be opened (raised immediately, not on iteration).
    json.JSONDecodeError
        During iteration, if a non-blank line is not valid JSON.
    """
    # JSON is UTF-8 by specification; don't depend on the locale's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        # Drop blank lines (e.g. a trailing newline at EOF) that json.loads would reject.
        lines = [line for line in f if line.strip()]
    return (json.loads(line) for line in lines)

In [706]:
# Load the timeline file present in this directory; the generator is single-use.
query_gen = load_tweets('user_timeline_Basbosah.jsonl')

### Turn query generators into dataframes

In [707]:
def tweet_to_df(tweets):
    """Flatten an iterable of tweet dicts into a three-column DataFrame.

    Each tweet must provide ``tweet['text']``, ``tweet['user']['screen_name']``
    and ``tweet['user']['location']``; the latter is stored under ``'geo'``.

    Parameters
    ----------
    tweets : iterable of dict
        Decoded tweet objects. The iterable is consumed exactly once.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with columns ``text``, ``screen_name`` and ``geo``.

    Raises
    ------
    KeyError
        If a tweet lacks one of the required keys.
    """
    # Pull the three fields of interest out of every tweet in a single pass.
    rows = [
        (tweet['text'], tweet['user']['screen_name'], tweet['user']['location'])
        for tweet in tweets
    ]

    # Pivot the row tuples into column lists keyed by column name.
    columns = ('text', 'screen_name', 'geo')
    data = {name: [row[pos] for row in rows] for pos, name in enumerate(columns)}

    return pd.DataFrame(data)

In [708]:
# NOTE: `query_gen` is a single-use generator; re-running this cell without
# re-creating it first yields an empty DataFrame.
query_df = tweet_to_df(query_gen)

In [709]:
# Row count, per-column non-null counts and dtypes of the freshly loaded frame
query_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3183 entries, 0 to 3182
Data columns (total 3 columns):
geo            3183 non-null object
screen_name    3183 non-null object
text           3183 non-null object
dtypes: object(3)
memory usage: 74.7+ KB


### Clean tweets (removes emojis, links, special characters, and ASCII chars/digits from tweet text)

Using regular expressions such as `[0-9A-Za-z:&!/._?)ุ'+-=โ]` can filter out much of the unwanted data and retain Arabic characters but still seems to also retain a more recently released generation of emojis. Try updating tweet-preprocessor to version 0.5.0 or using another tweet processing package. 

Different scenarios will require different regex combinations. For example, the approach below wouldn't be of much use if you're building English dialect classifiers, because it strips Latin letters from the tweet text.

In [710]:
def cleaner(message: str) -> str:
    """Replace every character matched by the filter pattern with a space.

    The character class covers ASCII digits and letters plus the punctuation
    listed in the pattern; characters outside the class are left untouched.

    Parameters
    ----------
    message : str
        Text to filter.

    Returns
    -------
    str
        ``message`` with each matched character substituted by ``" "``, so the
        length of the string is unchanged.
    """
    # NOTE(review): inside the class, `+-=` is a *range* (U+002B..U+003D), so it
    # also matches , - . / 0-9 : ; < and = -- confirm that this is intended.
    message = re.sub("([0-9A-Za-z:&!/._?)ุ'+-=โ])", " ", message)
    return message

In [711]:
# Run every tweet through the regex filter first, then through tweet-preprocessor
cleaned_text = [p.clean(cleaner(raw_text)) for raw_text in query_df['text']]
query_df['cleaned_text'] = cleaned_text

In [712]:
# Screen names only go through tweet-preprocessor (no regex filter)
cleaned_name = [p.clean(name) for name in query_df['screen_name']]
query_df['cleaned_name'] = cleaned_name

In [713]:
# Values are passed through str() before cleaning, so a missing location
# becomes its string representation rather than raising inside p.clean
cleaned_geo = [p.clean(str(location)) for location in query_df['geo']]
query_df['cleaned_geo'] = cleaned_geo


In [714]:
# Spot-check raw vs. cleaned columns; no random_state is set, so rows differ on every run
query_df.sample(4)

Unnamed: 0,geo,screen_name,text,cleaned_text,cleaned_name,cleaned_geo
700,East - ููููุชู,Basbosah,@albassamah Whose this ๐๐๐,,Basbosah,East - ููููุชู
860,East - ููููุชู,Basbosah,Oh Winter what surprises do you carry for us ๐คits getting warmer ๐,๐ค,Basbosah,East - ููููุชู
3130,East - ููููุชู,Basbosah,ุงููู ูุจููุง ุนูุช ุงูููุณ ุชุจุบูู โ๐ผ \nูุงููู ูุจูู ุนูุง ุงูุจุฎุช ูุง ูุฌูุจู ๐๐ผ,ุงููู ูุจููุง ุนูุช ุงูููุณ ุชุจุบูู ูุงููู ูุจูู ุนูุง ุงูุจุฎุช ูุง ูุฌูุจู,Basbosah,East - ููููุชู
3065,East - ููููุชู,Basbosah,@nawal66 ุดููู ุจูุฑุฏ ุนููู : ุฅูุช ุน ุฅูุด ูุณุชุนุฌูู ๐,ุดููู ุจูุฑุฏ ุนููู ุฅูุช ุน ุฅูุด ูุณุชุนุฌูู,Basbosah,East - ููููุชู


### Remove duplicated tweets

In [715]:
# Random peek at the per-row duplicate flags (True marks a repeat of an earlier cleaned_text)
query_df['cleaned_text'].duplicated().sample(5)

681     False
1279    False
570     False
2661    False
1256    False
Name: cleaned_text, dtype: bool

In [716]:
# keep=False discards *every* row whose cleaned_text occurs more than once
# (no representative is kept); the raw columns are then dropped.
query_cleaned = (
    query_df
    .drop_duplicates(subset=['cleaned_text'], keep=False)
    .drop(['screen_name', 'text', 'geo'], axis=1)
)

In [717]:
# Spot-check the de-duplicated frame; no random_state is set, so rows differ on every run
query_cleaned.sample(5)

Unnamed: 0,cleaned_text,cleaned_name,cleaned_geo
642,ูู ุงูุฎุจู ุงููู ุจูููุจ ูููู ุจูููุฑูุณ ูุงุฑุจูููู,Basbosah,East - ููููุชู
2363,ู ุน ูุงุงุงุงุฑ ุงูุชุธุฑ ุงูุญููู ุงูุงุฎูุฑู ูู ููู ุงูู ุซุฑููุฒ ุชุฎูุต ุฏุงูููููุฏ ูุง ุงููู ...,Basbosah,East - ููููุชู
2325,ูุณุงุนู ุฃูุชุธุฑ ูุดู ุงููู ูููุฒ ุงุช ูุงุฏูุช ูุงุนูุฏูู ูุงุนูุฏ ุฌุฏู ูุงุฌุฏ ูููุฑ ุงูููู ุณุฎูู,Basbosah,East - ููููุชู
2305,ูุญูุฏ ุงููู ุฃู ูููู ููุฐู ุงูุจูุงุฏ ุฎุฏูุฉ ุถููู ุงูุฑุญููุ ููุญูุฏู ุฃู ุจูุบูุง ุนูุฏ ุงูุฃ...,Basbosah,East - ููููุชู
2234,ูุจููุง ุชุฑุงุจ ุงููุทู ูุฐุง ุงูุฒูุงู ุฃุฎุถุฑ ูุงุดูุฑูุง ูุฌุฒู ุงููุนู ูุฒูุฏูู ุฃูุซุฑ ูุฑุงูุฉู ...,Basbosah,East - ููููุชู


In [718]:
# fillna returns a new frame, so the result must be kept for the fill to take effect
query_cleaned = query_cleaned.fillna('None')
# Preview a handful of rows instead of rendering the whole frame
query_cleaned.head(10)

Unnamed: 0,cleaned_text,cleaned_name,cleaned_geo
0,ูุฏูุช ุงูู ุงูุฅูุงุฑุงุช ๐ค๐ค๐ค,Basbosah,East - ููููุชู
1,ุชุฎููู ูููุงุฑ ูุณูุฉ ูุชุญุจููู ูุงุญุฏ ูุงุฎุชู ุงูู ุญุงุฌุฉ ุญุจู ุนุดุฑูู,Basbosah,East - ููููุชู
2,ุงุณุชูุจู ุจุดุงุฑ ุงูุฃุณุฏ ุงูููู ุงูุฑุฆูุณ ุงูุฑูุณู ุจูุชู ุจุฏุง ูู ุดูุก ุทุจูุนู ูููู ุฑูุงุฏ ู...,Basbosah,East - ููููุชู
3,ุจุฑุฑุฑุฑุฑุฏ,Basbosah,East - ููููุชู
4,๐ค๐ค๐ค๐ค,Basbosah,East - ููููุชู
5,ูู ุงูุณุนูุฏูู ุฒูุงู ุญูุง ูุดูู ุจุนุถ ุงูุฏูู ุงูุนุฑุจูู ูุชุญุฑุฑูู ุจุฒูุฏ ููู ููุธุฑูู ููู...,Basbosah,East - ููููุชู
7,ููููููููู ุงูุฒูุช ๐คฃ๐คฃ๐คฃุฑูุญ ุชุฒุญูู ููู,Basbosah,East - ููููุชู
8,ุดูููุง ุงูููุนูู ุฌุงูุฒุชู ูู ููู ูุณุจุจ ููุง ุงูุญุฒู ููุงุนุฏ ูุชุบุฒู ุจุฏููุนูุง ุงููู ูุฒู...,Basbosah,East - ููููุชู
9,ูุงุฑูุชู ููุจู ุงููุฏุงุก ูุงููุณู ููุงููุง,Basbosah,East - ููููุชู
10,ูููู ุงุจููุง ุนูุฏู ุฒูุงููุฑ ููุณู ุงูู ุฒูุฌูุง ุทูุจ ูู ุนุงุฏู ุนูุฏูุง ๐ค,Basbosah,East - ููููุชู


In [719]:
# Confirm the remaining row count and that no column holds nulls
query_cleaned.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2705 entries, 0 to 3182
Data columns (total 3 columns):
cleaned_text    2705 non-null object
cleaned_name    2705 non-null object
cleaned_geo     2705 non-null object
dtypes: object(3)
memory usage: 84.5+ KB


### Consolidate different geo tags

In [720]:
locations = {'Saudi Arabia': ['Kingdom of Saudi Arabia', 'MAKKAH', 'saudi arabia', 'jeddah', 'ููุฉ ุงูููุฑูุฉ', 'ุงูููููุฉ ุงูุนุฑุจูุฉ', 'Jeddah', 'Riyadh', 'ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'jeddah , saudi arabia', 'ุงูุฑูุงุถ', 'ุขููุตูู ุ ุจุฑูุฏู', 'ุฌุฏุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงููุฏููุฉ ุงููููุฑุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'Riyadh, Kingdom of Saudi Arabia', 'makkah - saudi arabia', 'ุฌุฏู', 'Dammam', 'Dammam, Eastern', 'Al Khobar, Kingdom of Saudi Arabia', 'ุงูุฎุจุฑ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏู', 'ุงูุณุนูุฏูุฉ', 'ุงููุตูู', 'ุชุจูู, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุฌุฏุฉ', 'ุงูุฎูุฌู, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ'],
             'Kuwait': ['ุงููููุช', 'Alkuwait', 'kuwait', 'KUWAIT', 'Kuwait , Salwa', 'Qortuba, Kuwait', 'ุฏููุฉ ุงููููุช', 'ุงูุนุงุตูู, ุฏููุฉ ุงููููุช', 'The Capital, Kuwait', 'Al Salam, Kuwait', 'Salwa, Kuwait', 'ูููุจ ุงูุดุฑู - Kuwait', 'ุฏููุฉ ุงููููุช-ุงูููุญุงุก', 'Al Adailiya, Kuwait'],
             'Egypt': ['Domiat, Egypt', 'Damitta', 'Alexandria, Egypt', 'Damitta-Egypt', 'Giza, Egypt', 'New damieta city', 'ูุตุฑ', 'ุงููุงูุฑุฉ, ูุตุฑ', 'El Behera, Egypt', 'ุงูุฌูุฒุฉ, ูุตุฑ', 'Elmansoura', 'El Menia, Egypt', 'El Qaliobia, Egypt', 'ุงูุจุญูุฑุฉ, ูุตุฑ', 'Domyat elgadeda', 'El Sharkia, Egypt', 'sharkia_fakous', 'Cairo, Egypt', 'Alexandria,Egypt', 'ELsharkia , Egypt', 'ุดูุงู ุณููุงุก, ูุตุฑ', 'ููุฑ ุงูุดูุฎ, ูุตุฑ', 'Port Said, Egypt', 'Beni Suef, Egypt', 'ุงูููููุจูุฉ, ูุตุฑ', 'ุจููุง', 'Aswan', 'Beni Suef, Egypt', 'Zagazig,Egypt', 'Ismalia, Egypt', 'Kafr El Shikh, Egypt', 'ุงููุงูุฑุฉ, ูุตุฑ', 'ุงูุจุญูุฑุฉ, ูุตุฑ', 'live in q8 .. bas masrya', 'ุงููุงูุฑุฉ, ูุตุฑ', 'Domiat, Egypt', 'Alexandria, Egypt', 'Cairo, Egypt', 'El Menia', 'ุฃุณููุฏุฑูุฉ', 'Alexandria', 'tanta', 'giza,egypt', 'Giza Governorate, Egypt', 'Mansoura', 'Mansura,Egypt','ุงููููููุฉ, ูุตุฑ', 'ูุตุฑ ุงูุงุณููุฏุฑูุฉ','El Sharkia, Egypt', 'El Daqahlia, Egypt', 'Cairo -Egypt', 'Cairo , Egypt', 'Aswan, Egypt', 'Mansurah, Egypt', 'ุงูุฃุณููุฏุฑูุฉ, ูุตุฑ', 'alexandria', 'ุงูุฃูุตุฑ, ูุตุฑ', 'Cairo', 'El Gharbia, Egypt', 'egypt', 'Mansoura,Egypt', 'Beni Suef, Egypt', 'Mansoura', 'El Daqahlia, Egypt', 'El Monofia, Egypt'],
             'UAE': ['United Arab Emirates', 'Dubai, United Arab Emirates', 'Dubai'],
             'Oman': ['ุณูุทูุฉ ุนูุงู', 'oman']
            }

In [693]:
# Sanity check: 'MAKKAH' should be listed under the first country, Saudi Arabia
'MAKKAH' in locations['Saudi Arabia']

True

In [694]:
# Sanity check: 'MAKKAH' should not be listed under the second country, Kuwait
'MAKKAH' in locations['Kuwait']

False

In [695]:
# Show the alias lists in the order the countries were declared in `locations`
locations.values()

dict_values([['Kingdom of Saudi Arabia', 'MAKKAH', 'saudi arabia', 'jeddah', 'ููุฉ ุงูููุฑูุฉ', 'ุงูููููุฉ ุงูุนุฑุจูุฉ', 'Jeddah', 'Riyadh', 'ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงูุฑูุงุถ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'jeddah , saudi arabia', 'ุงูุฑูุงุถ', 'ุขููุตูู ุ ุจุฑูุฏู', 'ุฌุฏุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุงููุฏููุฉ ุงููููุฑุฉ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'Riyadh, Kingdom of Saudi Arabia', 'makkah - saudi arabia', 'ุฌุฏู', 'Dammam', 'Dammam, Eastern', 'Al Khobar, Kingdom of Saudi Arabia', 'ุงูุฎุจุฑ, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏู', 'ุงูุณุนูุฏูุฉ', 'ุงููุตูู', 'ุชุจูู, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ', 'ุฌุฏุฉ', 'ุงูุฎูุฌู, ุงูููููุฉ ุงูุนุฑุจูุฉ ุงูุณุนูุฏูุฉ'], ['ุงููููุช', 'Alkuwait', 'kuwait', 'KUWAIT', 'Kuwait , Salwa', 'Qortuba, Kuwait', 'ุฏ

In [721]:
# Invert {country: [aliases]} into {alias: country} so each geo string is
# resolved with one dictionary lookup instead of a scan per country.
# `setdefault` keeps the first country that lists an alias, i.e. the order in
# which `locations` declares the countries.
alias_to_country = {}
for country, aliases in locations.items():
    for alias in aliases:
        alias_to_country.setdefault(alias, country)


def _consolidate_geo(geo):
    """Return the country for a cleaned geo string, or the value unchanged.

    The full string is tried first so that aliases which themselves contain an
    underscore can match; otherwise the text before the first '_' is tried.
    Non-string values are passed through untouched.
    """
    if not isinstance(geo, str):
        return geo
    if geo in alias_to_country:
        return alias_to_country[geo]
    return alias_to_country.get(geo.split('_')[0], geo)


# Build the new column and rebind the frame rather than calling
# `.replace(..., inplace=True)` on a column selection, which may act on a
# temporary copy and leave `query_cleaned` unchanged. Mapping row by row also
# guarantees that only the row being examined is rewritten.
query_cleaned = query_cleaned.assign(
    cleaned_geo=query_cleaned['cleaned_geo'].map(_consolidate_geo)
)

In [722]:
# Spot-check cleaned_geo after consolidation; no random_state is set, so rows differ on every run
query_cleaned.sample(5)

Unnamed: 0,cleaned_text,cleaned_name,cleaned_geo
2695,ุงููุณุงุทุงุช ุงูุญููููุฉ ูุง ุชุนูู ูุงูุชุตุฑูุญุงุช ุงููุฑุงุฏ ูููุง ุจุทููุฉ ูุงูุชุดุงุฑ ูุง ูููุฉ ููุง,Basbosah,East - ููููุชู
886,ุงุณุชูุงูุฉ ุงูุญุฑูุฑู ุฃููู ุชุญุฐูุฑ ููุฌู ูุญุฒุจ ุงูููโฆ ุขู ุงูุฃูุงู ูููุฎุฏูุนูู ุฃู ูููุฒู...,Basbosah,East - ููููุชู
882,ููุงุฐุง ุงูุญุจ ูุณุชูุฑ ูู ุงูุฃููุงู ูุฃู ุงููููู ุณุงุนุชูู ุจุณ,Basbosah,East - ููููุชู
973,ููู ููุงุจู ูุฐุง ูุฏูุน ูุตุงุญุจ ุงูุณุฌู ูุจูุบ ูุงูู ููุทูุน ุดูุฑู ุนู ูู ุตูุฏููุฉ ูุชู ุงู...,Basbosah,East - ููููุชู
24,ูู ูุงุฐู ุจุงูุจุญุฑูู,Basbosah,East - ููููุชู


### Pickle to conduct EDA in a separate notebook

In [723]:
pwd

'/home/jovyan/capstone-35/gulf_twitter_raw/gulf_timelines'

In [724]:
cd ../../gulf_twitter_pickled/

/home/jovyan/capstone-35/gulf_twitter_pickled


In [725]:
# Persist the cleaned frame so a separate notebook can load it;
# the path is resolved relative to the current working directory.
pickle_path = '../gulf_twitter_pickled/user_timeline_Basbosah.p'
query_cleaned.to_pickle(pickle_path)