In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Read the unprocessed tweet CSV, print its schema summary, then display
# descriptive statistics for the numeric columns.
raw_tweets_csv = '../data/unprocessed/Tweets.csv'
df = pd.read_csv(raw_tweets_csv)
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

Unnamed: 0,tweet_id,airline_sentiment_confidence,negativereason_confidence,retweet_count
count,14640.0,14640.0,10522.0,14640.0
mean,5.692184e+17,0.900169,0.638298,0.08265
std,779111200000000.0,0.16283,0.33044,0.745778
min,5.675883e+17,0.335,0.0,0.0
25%,5.685592e+17,0.6923,0.3606,0.0
50%,5.694779e+17,1.0,0.6706,0.0
75%,5.698905e+17,1.0,1.0,0.0
max,5.703106e+17,1.0,1.0,44.0


In [22]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [23]:
# some initial data integrity analysis

def analyse_columns(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Summarise dtype, missing values and cardinality for every column.

    Args:
        dataframe: The frame to inspect. It is not modified.

    Returns:
        A new DataFrame with one row per column of ``dataframe`` and the
        columns ``'Column'``, ``'Data Type'``, ``'Missing Values'``,
        ``'Missing Ratio (%)'`` (a string formatted to two decimals with a
        trailing ``%``) and ``'Unique Values'`` (as reported by
        ``Series.nunique()``, which skips nulls by default).
        The report columns are always present, even when ``dataframe`` has
        no columns at all.
    """
    report_columns = [
        'Column',
        'Data Type',
        'Missing Values',
        'Missing Ratio (%)',
        'Unique Values',
    ]
    total_rows = len(dataframe)
    column_analysis = []

    for column in dataframe.columns:
        series = dataframe[column]
        total_missing = series.isnull().sum()
        # A zero-row frame has nothing missing; avoid 0/0, which would
        # otherwise emit a RuntimeWarning and render as 'nan%'.
        missing_ratio = total_missing / total_rows * 100 if total_rows else 0.0

        column_analysis.append({
            'Column': column,
            'Data Type': series.dtype,
            'Missing Values': total_missing,
            'Missing Ratio (%)': f'{missing_ratio:.2f}%',
            'Unique Values': series.nunique(),
        })

    # Pinning the columns keeps the report schema stable for a frame with no
    # columns, so downstream column selection never hits a KeyError.
    return pd.DataFrame(column_analysis, columns=report_columns)

# Display the per-column integrity report for the loaded tweets.
analyse_columns(dataframe=df)

Unnamed: 0,Column,Data Type,Missing Values,Missing Ratio (%),Unique Values
0,tweet_id,int64,0,0.00%,14485
1,airline_sentiment,object,0,0.00%,3
2,airline_sentiment_confidence,float64,0,0.00%,1023
3,negativereason,object,5462,37.31%,10
4,negativereason_confidence,float64,4118,28.13%,1410
5,airline,object,0,0.00%,6
6,airline_sentiment_gold,object,14600,99.73%,3
7,name,object,0,0.00%,7701
8,negativereason_gold,object,14608,99.78%,13
9,retweet_count,int64,0,0.00%,18


In [24]:
# airline_sentiment_gold and negativereason_gold are each more than 99% missing
# (99.73% and 99.78%), and tweet_coord is roughly 93% missing (only 1,019 of
# 14,640 rows are populated).
# These columns are dropped because they carry little information and imputing
# that many missing values would be unreasonable.
#
# errors='ignore' keeps this cell idempotent: df is reassigned from itself, so
# without it a second run would raise KeyError once the columns are already gone.
df = df.drop(
    columns=['airline_sentiment_gold', 'negativereason_gold', 'tweet_coord'],
    errors='ignore',
)