# Imports

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import json
import spacy

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

# Munging

In [2]:
def __process_tweets(filename):
    """
    Load a text file of scraped tweets into a dataframe.

    Each line of the file is parsed as one JSON document. Lines that are
    not valid JSON (for example blank or partial lines) are skipped.

    Parameters
    ----------
    filename : str or path-like
        Path to the txt file created from scraping Twitter.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed line, in file order.
    """
    # Parsed records, one entry per valid JSON line.
    tweets_data = []

    # The context manager closes the file even if reading fails part-way.
    with open(filename, "r") as file:
        for item in file:
            try:
                tweets_data.append(json.loads(item))
            except json.JSONDecodeError:
                # Best-effort load: skip unparseable lines, but let unrelated
                # errors (e.g. KeyboardInterrupt) propagate instead of hiding them.
                continue

    return pd.DataFrame(tweets_data)

In [3]:
text = "../data/OkadaBan_{0}.txt"  # filename template; {0} is filled with the file number via text.format(i)

In [4]:
# Process each of the 13 scraped files into its own dataframe, then
# concatenate them once. Concatenating inside the loop would re-copy every
# previously accumulated row on each iteration.
frames = []
for i in range(1, 14):

    okada_ban_file = text.format(i)

    temp = __process_tweets(okada_ban_file)

    # Files that produced no rows contribute nothing to the result.
    if not temp.empty:
        frames.append(temp)

# Fall back to an empty dataframe when no file yielded any tweets.
data = pd.concat(frames, ignore_index=True, sort=False) if frames else pd.DataFrame()

print(data.shape)

(3253, 36)


# Reloads the tweets from the CSV snapshot instead of the raw txt files.
# NOTE(review): this cell has no execution count, and "OkadaBan.csv" is only
# written by a later cell. Running it would overwrite `data`; nested columns
# that later cells index like dicts (e.g. quoted_status) would presumably come
# back from CSV as plain strings — confirm before relying on this path.
data = pd.read_csv("OkadaBan.csv")

In [23]:
# Loads a second dataset from CSV.
# NOTE(review): in the visible cells `plentywaka` is only filtered and its shape shown — confirm it is still needed.
plentywaka = pd.read_csv("../data/plentywaka_data_1.csv")

In [6]:
# Summary statistics over the timestamp and engagement-count columns.
data.loc[:, ["created_at", "favorite_count", "reply_count", "retweet_count", "quote_count"]].describe()

Unnamed: 0,favorite_count,reply_count,retweet_count,quote_count
count,3253.0,3253.0,3253.0,3253.0
mean,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0


In [26]:
# Keep only the rows whose "truncated" flag is False.
plentywaka = plentywaka[plentywaka["truncated"].eq(False)]

In [27]:
plentywaka.shape  # (number of rows, number of columns)

(735, 333)

In [6]:
data.head()  # preview the first rows of the raw tweets

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,entities,favorited,retweeted,filter_level,lang,timestamp_ms,extended_entities,possibly_sensitive,display_text_range,extended_tweet
0,Tue Feb 04 07:30:59 +0000 2020,1224596197270282242,1224596197270282242,RT @LSolarin1: Dear lagosians If you live arou...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",False,False,low,en,1580801459983,,,,
1,Tue Feb 04 07:31:01 +0000 2020,1224596205424062464,1224596205424062464,RT @akorive001: As we drill @jidesanwoolu on #...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,"{'hashtags': [{'text': 'LagosOkadaBan', 'indic...",False,False,low,en,1580801461927,,,,
2,Tue Feb 04 07:31:09 +0000 2020,1224596238529716227,1224596238529716227,"RT @Tife_fabunmi: Dear Lagosians, kindly stay ...","<a href=""http://twitter.com/download/android"" ...",False,,,,,...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",False,False,low,en,1580801469820,,,,
3,Tue Feb 04 07:31:33 +0000 2020,1224596337557213185,1224596337557213185,RT @LSolarin1: Dear lagosians If you live arou...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",False,False,low,en,1580801493430,,,,
4,Tue Feb 04 07:31:50 +0000 2020,1224596407149113345,1224596407149113345,RT @Amy_Siskind: So Klobuchar who was the only...,"<a href=""http://twitter.com/#!/download/ipad"" ...",False,,,,,...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",False,False,low,en,1580801510022,,,,


data.to_csv("OkadaBan.csv", index=False)  # snapshot the raw tweets dataframe to CSV, without the index column

In [7]:
# Show one randomly chosen tweet. DataFrame.sample is used because `sample`
# is never imported in this notebook, so the bare call fails on a fresh kernel.
data.sample(n=1)

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,entities,favorited,retweeted,filter_level,lang,timestamp_ms,extended_entities,possibly_sensitive,display_text_range,extended_tweet
1631,Mon Feb 03 15:40:11 +0000 2020,1224356919265112064,1224356919265112064,RT @g_omijie: @Mr_JAGs @Riddwane @dr_oladeinde...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",False,False,low,en,1580744411661,,,,


### Comments

Upon inspection, different categories of tweets must be dealt with:

* Quoted tweets: data.is_quote_status == True | Check quoted_status dict for full details | Some may be truncated, others will not be. 
* Truncated tweets: data.truncated == True | Check extended_tweet dict for full tweet. 
* Non-retweeted tweets: data.retweeted_status.isna() == True | Full tweet is already in text column
* Retweeted tweets: data.retweeted_status.notna() == True | Check retweeted_status dict for full tweet | Some may be truncated, others will not be. 

## Quoted Tweets

In [8]:
data.is_quote_status.value_counts()  # number of quoted vs. non-quoted tweets

False    2276
True      977
Name: is_quote_status, dtype: int64

In [9]:
# Separate out quoted tweets so they can be treated differently.
# An explicit copy is taken because this frame is modified in place further
# down (rows dropped, "text" overwritten); mutating a slice of `data` is
# chained assignment and its effect is not guaranteed.
quoted_data = data.loc[data.is_quote_status.eq(True)].copy()

In [10]:
# Remove from `data` the quoted tweets that now live in `quoted_data`.
data.drop(index=quoted_data.index, inplace=True)

### Truncated Quoted Tweets

In [11]:
# Count how many quoted tweets carry a truncated quoted_status.
quoted_data["quoted_status"].apply(lambda status: status["truncated"]).value_counts()

True     784
False    193
Name: quoted_status, dtype: int64

In [12]:
# Separate out the quoted tweets whose quoted_status is truncated.
# Copy explicitly: the "text" column of this frame is overwritten next, and
# assigning into a slice of `quoted_data` would be chained assignment.
quoted_data_truncated = quoted_data.loc[quoted_data.quoted_status.apply(lambda x: x["truncated"]).eq(True)].copy()

In [13]:
# Replace the text with the full text stored under quoted_status -> extended_tweet.
quoted_data_truncated.loc[:, "text"] = quoted_data_truncated["quoted_status"].apply(
    lambda status: status["extended_tweet"]["full_text"]
)

In [14]:
# One random text as a single-element list. Series.sample replaces the bare
# `sample` call, which is never imported in this notebook.
quoted_data_truncated["text"].sample(n=1).tolist()

['Dear Lagosians, kindly stay safe out there. Don’t accept every lift offer you get from any vehicle when trekking to your bus stop or office.\n\nIt’s better to trek few distances than to get missing.\nStay safe and observant.💙']

### Non-Truncated Quoted Tweets 

In [15]:
# Remove the truncated quoted tweets that now live in `quoted_data_truncated`.
quoted_data.drop(index=quoted_data_truncated.index, inplace=True)

In [16]:
# Replace the text with the "text" field of the quoted status.
quoted_data.loc[:, "text"] = quoted_data["quoted_status"].apply(lambda status: status["text"])

In [17]:
# One random text as a single-element list. Series.sample replaces the bare
# `sample` call, which is never imported in this notebook.
quoted_data["text"].sample(n=1).tolist()

["What's the strangest way you found out YOU yourself you are poor?"]

## Truncated Tweets

In [18]:
data.truncated.value_counts()  # number of truncated vs. non-truncated tweets left in data

False    1846
True      430
Name: truncated, dtype: int64

In [19]:
# Separate out truncated tweets so they can be treated differently.
# Copy explicitly: the "text" column of this frame is overwritten next, and
# assigning into a slice of `data` would be chained assignment.
truncated_data = data.loc[data.truncated.eq(True)].copy()

In [20]:
# Replace the text with the full text stored in the extended_tweet payload.
truncated_data.loc[:, "text"] = truncated_data["extended_tweet"].apply(
    lambda extended: extended["full_text"]
)

In [21]:
# One random text as a single-element list. Series.sample replaces the bare
# `sample` call, which is never imported in this notebook.
truncated_data["text"].sample(n=1).tolist()

['@StigEzeh @irediaaa howfar that year wey men go trek from Ekosodin go gtbank for main gate deposit H make man fit collect 2H for fidelity atm take chow one Santana without meat for purely 🤧😭😂']

## Non-retweeted Tweets

In [22]:
# Remove from `data` the truncated tweets that now live in `truncated_data`.
data.drop(index=truncated_data.index, inplace=True)

In [23]:
data.retweeted_status.isna().sum()  # count of rows with no retweeted_status, i.e. tweets that are not retweets

249

In [24]:
# Tweets with no retweeted_status are originals; set them aside.
retweeted_status_isna_data = data[data["retweeted_status"].isna()]

In [25]:
# One random text as a single-element list. Series.sample replaces the bare
# `sample` call, which is never imported in this notebook.
retweeted_status_isna_data["text"].sample(n=1).tolist()

["I watched the first episode of star trek picard, it's AMAZING, the whole plot was so interesting, it was everything I expect from star trek"]

## Retweeted Tweets

In [26]:
# Remove from `data` the non-retweets that now live in `retweeted_status_isna_data`.
data.drop(index=retweeted_status_isna_data.index, inplace=True)

In [27]:
# Count how many of the remaining retweets have a truncated retweeted_status.
data["retweeted_status"].apply(lambda status: status["truncated"]).value_counts()

True     1446
False     151
Name: retweeted_status, dtype: int64

### Truncated Retweeted Tweets

In [28]:
# Separate out the retweets whose retweeted_status is truncated.
# Copy explicitly: the "text" column of this frame is overwritten next, and
# assigning into a slice of `data` would be chained assignment.
retweeted_status_notna_truncated_true_data = data.loc[data.retweeted_status.apply(lambda x: x["truncated"]).eq(True)].copy()

In [29]:
# Replace the text with the full text stored under retweeted_status -> extended_tweet.
retweeted_status_notna_truncated_true_data.loc[:, "text"] = (
    retweeted_status_notna_truncated_true_data["retweeted_status"]
    .apply(lambda status: status["extended_tweet"]["full_text"])
)

In [30]:
# One random text as a single-element list. Series.sample replaces the bare
# `sample` call, which is never imported in this notebook.
retweeted_status_notna_truncated_true_data["text"].sample(n=1).tolist()

["Please if you'll be trekking from ikeja along tomorrow to either opebi or ikeja GRA or anywhere inside ikeja kindly like this tweet. I'll DM you a link to a WhatsApp group so we can trek in groups\nThank you.❤️\n\n@_NerdyTalker @Gidi_Traffic @BisolaSanusi @iamdahmmie RT🙏"]

### Non-Truncated Retweeted Tweets

In [31]:
# Remove from `data` the truncated retweets that now live in their own frame.
data.drop(index=retweeted_status_notna_truncated_true_data.index, inplace=True)

In [32]:
# Replace the text with the "text" field of the retweeted status.
data.loc[:, "text"] = data["retweeted_status"].apply(lambda status: status["text"])

In [33]:
# One random text as a single-element list. Series.sample replaces the bare
# `sample` call, which is never imported in this notebook.
data["text"].sample(n=1).tolist()

["Whether you prefer the view from Earth's highest peak or deepest crevices, find the inspiration for your next trek. https://t.co/0Lz6vkrI43"]

# Data Integration

In [44]:
useful_columns = ["created_at", "reply_count", "retweet_count", "favorite_count", "quote_count", "text", "user"]  # columns kept for analysis; "user" is a nested record that is flattened and dropped further down

In [45]:
# The six per-category frames produced above; by this point `data` holds only the non-truncated retweets.
datasets = [data, retweeted_status_notna_truncated_true_data, retweeted_status_isna_data, truncated_data, quoted_data, quoted_data_truncated]

In [47]:
# Stack the per-category frames, keeping only the analysis columns.
cleaned_data = pd.concat([frame.loc[:, useful_columns] for frame in datasets], sort=False)

In [67]:
# Preview the first rows of the combined frame.
# NOTE(review): the recorded output already shows screen_name and
# followers_count, which are only created in a later cell — this cell was run
# out of order and would display different columns on a top-to-bottom run.
cleaned_data.head()

Unnamed: 0,created_at,reply_count,retweet_count,favorite_count,quote_count,text,screen_name,followers_count
0,Tue Feb 04 07:30:59 +0000 2020,0,0,0,0,"Dear Lagosians, kindly stay safe out there. Do...",Hawt_Carter,524
1,Tue Feb 04 07:31:01 +0000 2020,0,0,0,0,"As we drill @jidesanwoolu on #LagosOkadaBan, a...",sarafa_j,470
2,Tue Feb 04 07:31:09 +0000 2020,0,0,0,0,"Dear Lagosians, kindly stay safe out there. Do...",OluwapelumiOhu,165
3,Tue Feb 04 07:31:33 +0000 2020,0,0,0,0,"Dear Lagosians, kindly stay safe out there. Do...",MsLena293,13
4,Tue Feb 04 07:31:50 +0000 2020,0,0,0,0,So Klobuchar who was the only candidate to tre...,goodmorninchirp,42


In [50]:
# The frames were concatenated category by category with their original index
# labels, so sort by index to put the rows back in index order.
cleaned_data.sort_index(inplace=True)

In [54]:
# Flatten two fields of the nested "user" record into plain columns.
cleaned_data.loc[:, "screen_name"] = cleaned_data["user"].map(lambda profile: profile["screen_name"])
cleaned_data.loc[:, "followers_count"] = cleaned_data["user"].map(lambda profile: profile["followers_count"])

In [55]:
# Drop the nested "user" column.
cleaned_data.drop(columns="user", inplace=True)

In [61]:
# Star Trek tweets got into the stream a lot because of the search terms;
# collect them so they can be removed.
# The pattern is a plain phrase, so regex matching is switched off, and
# na=False treats a missing text as "no match" instead of yielding a NaN
# mask that .loc would reject.
star_trek_data = cleaned_data.loc[cleaned_data.text.str.contains("star trek", case=False, regex=False, na=False)]

In [64]:
# Remove the Star Trek rows and renumber the remaining rows from 0.
cleaned_data = cleaned_data.drop(index=star_trek_data.index).reset_index(drop=True)

Int64Index([   0,    1,    2,    3,    4,    5,    7,    8,    9,   10,
            ...
            3243, 3244, 3245, 3246, 3247, 3248, 3249, 3250, 3251, 3252],
           dtype='int64', length=2855)

In [69]:
cleaned_data.to_csv("OkadaBan_Cleaned.csv", index=False)  # write the cleaned tweets to CSV, without the index column