# Import Modules

In [2]:
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(os.path.join('..')))

from extract_dataframe import TweetDfExtractor
from extract_dataframe import read_json

# Load the raw tweet dump. read_json returns a 2-tuple; only its second
# element (the tweet list) is used in this notebook, the first is discarded.
_, tweet_list = read_json("../data/global_twitter_data.json")

# Data Preprocessing


## Read Given Data

In [3]:
# Build the working DataFrame from the raw tweet list with the project's
# extractor, then preview the first rows.
tweets_df_extractor = TweetDfExtractor(tweet_list)
tweets_df = tweets_df_extractor.get_tweet_df()
tweets_df.head()

Unnamed: 0,created_at,source,full_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Sun Aug 07 22:31:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @i_ameztoy: Extra random image (I):\n\nLets...,-0.125,0.190625,en,15760,2,i_ameztoy,20497,2621,,[City],[i_ameztoy],
1,Sun Aug 07 22:31:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",RT @IndoPac_Info: #China's media explains the ...,-0.1,0.1,en,6967,201,ZIisq,65,272,,"[China, Taiwan]",[IndoPac_Info],
2,Sun Aug 07 22:31:07 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","China even cut off communication, they don't a...",0.0,0.0,en,2166,0,Fin21Free,85,392,,[XiJinping],[ZelenskyyUa],Netherlands
3,Sun Aug 07 22:31:06 +0000 2022,"<a href=""http://twitter.com/download/android"" ...","Putin to #XiJinping : I told you my friend, Ta...",0.1,0.35,en,2166,0,Fin21Free,85,392,,[XiJinping],[],Netherlands
4,Sun Aug 07 22:31:04 +0000 2022,"<a href=""http://twitter.com/download/iphone"" r...","RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.55625,en,17247,381,VizziniDolores,910,2608,,[],[ChinaUncensored],"Ayent, Schweiz"


In [9]:
# Column dtypes and non-null counts of the extracted frame.
# NOTE(review): the recorded output of this cell reports 0 entries, while
# another cell reports a shape of (22000, 15) — the output looks stale or the
# cell was run out of order; re-run the notebook from a fresh kernel.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   created_at          0 non-null      object
 1   source              0 non-null      object
 2   full_text           0 non-null      object
 3   polarity            0 non-null      object
 4   subjectivity        0 non-null      object
 5   lang                0 non-null      object
 6   favorite_count      0 non-null      object
 7   retweet_count       0 non-null      object
 8   original_author     0 non-null      object
 9   followers_count     0 non-null      object
 10  friends_count       0 non-null      object
 11  possibly_sensitive  0 non-null      object
 12  hashtags            0 non-null      object
 13  user_mentions       0 non-null      object
 14  place               0 non-null      object
dtypes: object(15)
memory usage: 0.0+ bytes


## Remove Duplicate Rows

In [4]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from clean_tweets_dataframe import Clean_Tweets
# Cleaning helper used by the steps that follow. Per the recorded output,
# constructing it prints a banner message.
clean_tweets = Clean_Tweets(tweets_df)

Automation in Action...!!!


In [5]:
# Baseline (rows, columns) before the cleaning steps below are applied.
tweets_df.shape


(22000, 15)

In [None]:
# Keep the returned frame, as every other Clean_Tweets step in this notebook
# does. Calling the method without assigning its result discards the
# de-duplicated data whenever the method returns a new DataFrame.
# NOTE(review): assumes drop_duplicate returns the cleaned DataFrame like its
# sibling methods — confirm against clean_tweets_dataframe.py.
tweets_df = clean_tweets.drop_duplicate(tweets_df)
tweets_df.shape

## Remove Non English Tweets

In [12]:
# Delegate the non-English filtering to Clean_Tweets and show the new shape.
# In the recorded run the shape is (22000, 15), the same as the baseline
# shape recorded earlier, i.e. no rows were dropped by this step.
tweets_df = clean_tweets.remove_non_english_tweets(tweets_df)
tweets_df.shape

(22000, 15)

## Check for Missing Values


In [13]:
# Per-column NaN counts, used to decide which columns need filling.
tweets_df.isna().sum()


created_at                0
source                    0
full_text                 0
polarity                  0
subjectivity              0
lang                      0
favorite_count            0
retweet_count             0
original_author           0
followers_count           0
friends_count             0
possibly_sensitive    15809
hashtags                  0
user_mentions             0
place                     0
dtype: int64

## Manage Missing Data


In [18]:
# Replace NaN in possibly_sensitive with the sentinel string "not_found",
# then re-check the per-column NaN counts.
# NOTE(review): the sentinel is a string; if the non-missing values are
# booleans the column ends up with mixed types — confirm this is intended.
tweets_df['possibly_sensitive'] = tweets_df['possibly_sensitive'].fillna("not_found")
tweets_df.isna().sum()


created_at            0
source                0
full_text             0
polarity              0
subjectivity          0
lang                  0
favorite_count        0
retweet_count         0
original_author       0
followers_count       0
friends_count         0
possibly_sensitive    0
hashtags              0
user_mentions         0
place                 0
dtype: int64

In [21]:
# Count rows whose place is an empty string. These are not NaN, so the
# isna() summaries report 0 missing values for place even though the
# location is effectively unknown for these rows.
tweets_df[tweets_df['place'] == ''].shape


(9892, 15)

In [22]:
# Missing locations are stored as empty strings rather than NaN (place has
# no NaN but thousands of '' values), so fillna() alone changes nothing.
# Map both NaN and '' to the "not_known" placeholder.
tweets_df['place'] = tweets_df['place'].fillna("not_known").replace('', "not_known")
# Guard against the empty-string case silently coming back on a re-run.
assert (tweets_df['place'] != '').all(), "place still contains empty strings"
tweets_df.isna().sum()

created_at            0
source                0
full_text             0
polarity              0
subjectivity          0
lang                  0
favorite_count        0
retweet_count         0
original_author       0
followers_count       0
friends_count         0
possibly_sensitive    0
hashtags              0
user_mentions         0
place                 0
dtype: int64

## Remove Unwanted Characters


In [None]:
# Clean the place column through Clean_Tweets.remove_place_characters (what
# exactly is removed is defined in clean_tweets_dataframe.py), then re-check
# the per-column NaN counts.
tweets_df = clean_tweets.remove_place_characters(tweets_df)
tweets_df.isna().sum()

## Convert to Datetime


In [27]:
# Run the datetime conversion step and confirm the resulting dtypes; in the
# recorded output created_at is datetime64[ns, UTC].
tweets_df = clean_tweets.convert_to_datetime(tweets_df)
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22000 entries, 0 to 21999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   created_at          22000 non-null  datetime64[ns, UTC]
 1   source              22000 non-null  object             
 2   full_text           22000 non-null  object             
 3   polarity            22000 non-null  float64            
 4   subjectivity        22000 non-null  float64            
 5   lang                22000 non-null  object             
 6   favorite_count      22000 non-null  int64              
 7   retweet_count       22000 non-null  int64              
 8   original_author     22000 non-null  object             
 9   followers_count     22000 non-null  int64              
 10  friends_count       22000 non-null  int64              
 11  possibly_sensitive  22000 non-null  object             
 12  hashtags            22000 non-nu

## Extract device from source text


In [29]:
# Preview the first ten raw `source` values (HTML anchor strings in the
# recorded output).
# NOTE(review): this section only inspects the column; no device value is
# actually extracted from it before the frame is saved.
tweets_df['source'].head(10)


0    <a href="http://twitter.com/download/android" ...
1    <a href="http://twitter.com/download/android" ...
2    <a href="http://twitter.com/download/android" ...
3    <a href="http://twitter.com/download/android" ...
4    <a href="http://twitter.com/download/iphone" r...
5    <a href="http://twitter.com/download/android" ...
6    <a href="http://twitter.com/download/android" ...
7    <a href="http://twitter.com/download/android" ...
8    <a href="http://twitter.com/download/android" ...
9    <a href="https://mobile.twitter.com" rel="nofo...
Name: source, dtype: object

## Save Preprocessed Data

In [None]:
# Persist the cleaned frame without the index column.
# NOTE(review): hashtags and user_mentions appear to hold list values in the
# head() preview; CSV stores them as their string representation — confirm
# downstream readers parse them back.
tweets_df.to_csv("../data/global_twitter_preprocessed_data.csv", index=False)
