Importing Libraries

In [None]:
%matplotlib inline
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster import hierarchy
import seaborn as sns
from sklearn import cluster, preprocessing


In [None]:
# Load the exported tweet-activity metrics; the path has no directory part,
# so the CSV is looked up in the current working directory.
df = pd.read_csv("tweet_activity_metrics___mharrison___20201101_20201201_en.csv")

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Parse the `time` column into pandas datetimes so the `.dt` accessor can be used on it.
df = df.assign(time=pd.to_datetime(df["time"]))

In [None]:
# Treat cells whose value is exactly '-' as missing: replace them with NaN.
# `pd.np` was removed in pandas 2.0, so NumPy's NaN is referenced directly.
df = df.replace('-', np.nan)

In [None]:
# Preview again now that '-' cells have been replaced with NaN.
df.head()

In [None]:
# Re-check dtypes and non-null counts after the datetime conversion and NaN replacement.
df.info()

### `Data Cleaning`

In [None]:
def sanitize(df):
    """Return a cleaned copy of ``df`` with uninformative columns removed.

    The steps run in order on new frames; ``df`` itself is not modified:

    1. Drop columns whose values are all NaN.
    2. Drop columns whose values are all equal to 0 / 0.0.
    3. Replace spaces with underscores in string column labels; labels that
       are not strings are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame containing the surviving, renamed columns.
    """
    return (
        df
        .dropna(axis=1, how="all")
        # Build the all-zero mask from the intermediate frame (not the outer
        # `df`) so it only ever refers to columns that survived `dropna`.
        .loc[:, lambda frame: ~frame.eq(0.0).all()]
        .rename(
            columns=lambda label: label.replace(" ", "_")
            if isinstance(label, str)
            else label
        )
    )

In [None]:
def tweet_analysis(df):
    """Return a copy of ``df`` extended with per-tweet feature columns.

    Reads the ``Tweet_text`` column (string data) and the ``time`` column
    (datetime data, accessed through ``.dt``). ``df`` itself is not modified.

    Added columns
    -------------
    is_reply, is_quote : text starts with ``@`` / with a double quote.
    length : number of characters in the text.
    word_length : number of whitespace-separated words (NaN for missing text).
    has_emoji : text contains a code point in U+1F600-U+1F64F.
    hashtag_count, mention_count, new_line_count : occurrences of ``#``,
        ``@`` and newline characters.
    month, day_of_week, hour : taken from ``time`` (``day_of_week`` is
        pandas' ``dayofweek``, i.e. Monday == 0).
    has_mentions, has_links, has_media, has_new_line : text contains the
        literal substring ``@``, ``http``, ``pic.twitter.com`` or a newline.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Tweet_text`` and ``time`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns listed above appended (or overwritten if
        they already exist).
    """
    text = df["Tweet_text"]
    posted = df["time"]
    return df.assign(
        is_reply=text.str.startswith("@"),
        is_quote=text.str.startswith('"'),
        length=text.str.len(),
        # `.str.len()` on the split lists yields NaN for missing text instead
        # of raising, which `.apply(len)` would do on a NaN value.
        word_length=text.str.split().str.len(),
        # Only the U+1F600-U+1F64F range is checked, not every emoji.
        has_emoji=text.str.contains(r"[\U0001F600-\U0001F64F]"),
        hashtag_count=text.str.count("#"),
        month=posted.dt.month,
        day_of_week=posted.dt.dayofweek,
        hour=posted.dt.hour,
        # Substring checks are literal (`regex=False`); as a regex, the dots
        # in "pic.twitter.com" would match any character.
        has_mentions=text.str.contains("@", regex=False),
        has_links=text.str.contains("http", regex=False),
        has_media=text.str.contains("pic.twitter.com", regex=False),
        has_new_line=text.str.contains("\n", regex=False),
        new_line_count=text.str.count("\n"),
        mention_count=text.str.count("@"),
    )

In [None]:
# Column labels of `df` before it is passed through sanitize().
df.columns

In [None]:
# (rows, columns) of `df` before it is passed through sanitize().
df.shape

In [None]:
# Clean the frame first, then derive the per-tweet feature columns from the cleaned result.
df_sanitize = df.pipe(sanitize)
tweet_stats_df = df_sanitize.pipe(tweet_analysis)
tweet_stats_df.head()


In [None]:
# Shape after sanitize(); compare with df.shape to see how many columns were dropped.
df_sanitize.shape

In [None]:
# Shape after tweet_analysis() added its derived columns.
tweet_stats_df.shape

In [None]:
# Column labels of the feature-augmented frame.
tweet_stats_df.columns

In [None]:
# NOTE(review): this inspects the raw `df`, not `tweet_stats_df` — confirm that is intended.
df.info()