### Predict if a tweet will go viral using a logistic regression model

#### 1. Explore the data
#### 2. Clean the data
#### 3. Create the prediction label
#### 4. Create the features
#### 5. Train the prediction model
#### 6. Test the model

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Location of the tweets dataset (JSON Lines: one tweet object per line).
# Set the TWEETS_JSON environment variable to point at your own copy of
# random_tweets.json; the fallback is the original author's local path, so the
# notebook keeps working unchanged on that machine.
filepath = os.environ.get(
    'TWEETS_JSON',
    'C:\\Users\\Yeo Kheng Feng\\Desktop\\Github repositories\\random_tweets.json',
)
tweets = pd.read_json(filepath, lines=True)

#### Explore the data

In [4]:
# Summarise the raw frame: column names, non-null counts and dtypes
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11099 entries, 0 to 11098
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   created_at                 11099 non-null  datetime64[ns, UTC]
 1   id                         11099 non-null  int64              
 2   id_str                     11099 non-null  int64              
 3   text                       11099 non-null  object             
 4   truncated                  11099 non-null  bool               
 5   entities                   11099 non-null  object             
 6   metadata                   11099 non-null  object             
 7   source                     11099 non-null  object             
 8   in_reply_to_status_id      1402 non-null   float64            
 9   in_reply_to_status_id_str  1402 non-null   float64            
 10  in_reply_to_user_id        1503 non-null   float64            
 11  in

In [5]:
# Preview the first rows of the raw frame
tweets.head()

Unnamed: 0,created_at,id,id_str,text,truncated,entities,metadata,source,in_reply_to_status_id,in_reply_to_status_id_str,...,favorite_count,favorited,retweeted,lang,possibly_sensitive,quoted_status_id,quoted_status_id_str,extended_entities,quoted_status,withheld_in_countries
0,2018-07-31 13:34:40+00:00,1024287229525598210,1024287229525598208,RT @KWWLStormTrack7: We are more than a month ...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,0,False,False,en,,,,,,
1,2018-07-31 13:34:40+00:00,1024287229512953856,1024287229512953856,@hail_ee23 Thanks love its just the feeling of...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",1.024128e+18,1.024128e+18,...,0,False,False,en,,,,,,
2,2018-07-31 13:34:40+00:00,1024287229504569344,1024287229504569344,RT @TransMediaWatch: Pink News has more on the...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",,,...,0,False,False,en,0.0,,,,,
3,2018-07-31 13:34:40+00:00,1024287229496029190,1024287229496029184,RT @realDonaldTrump: One of the reasons we nee...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,0,False,False,en,,,,,,
4,2018-07-31 13:34:40+00:00,1024287229492031490,1024287229492031488,RT @First5App: This hearing of His Word doesn’...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",,,...,0,False,False,en,,,,,,


In [6]:
# Tweet text side by side with its retweet count, first 8 rows
tweets.loc[:, ['text', 'retweet_count']].head(8)

Unnamed: 0,text,retweet_count
0,RT @KWWLStormTrack7: We are more than a month ...,3
1,@hail_ee23 Thanks love its just the feeling of...,0
2,RT @TransMediaWatch: Pink News has more on the...,5
3,RT @realDonaldTrump: One of the reasons we nee...,11106
4,RT @First5App: This hearing of His Word doesn’...,6
5,RT @attackerman: This is torture: “The staff t...,195
6,Did a demo of our Mobile Prototyping Kit at UX...,0
7,RT @itstae13: Stop getting rid of your pets be...,162420


In [8]:
# Each entry of the 'user' column is a dict describing the account that tweeted;
# show the one attached to the first tweet.
tweets.at[0, 'user']

{'id': 145388018,
 'id_str': '145388018',
 'name': 'Derek Wolkenhauer',
 'screen_name': 'derekw221',
 'location': 'Waterloo, Iowa',
 'description': '',
 'url': None,
 'entities': {'description': {'urls': []}},
 'protected': False,
 'followers_count': 215,
 'friends_count': 335,
 'listed_count': 2,
 'created_at': 'Tue May 18 21:30:10 +0000 2010',
 'favourites_count': 3419,
 'utc_offset': None,
 'time_zone': None,
 'geo_enabled': True,
 'verified': False,
 'statuses_count': 4475,
 'lang': 'en',
 'contributors_enabled': False,
 'is_translator': False,
 'is_translation_enabled': False,
 'profile_background_color': '022330',
 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme15/bg.png',
 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme15/bg.png',
 'profile_background_tile': False,
 'profile_image_url': 'http://pbs.twimg.com/profile_images/995790590276243456/cgxRVviN_normal.jpg',
 'profile_image_url_https': 'https://pbs.twimg.com/profile

#### Data cleaning

In [None]:
# Number of rows whose 'text' repeats an earlier row
tweets.duplicated(subset=['text']).sum()

In [None]:
# Keep only the first occurrence of each distinct tweet text, then confirm that
# no duplicated text remains (expected result: 0)
tweets = tweets.drop_duplicates(subset='text')
tweets.duplicated(subset='text').sum()

In [None]:
# Rows left after de-duplication: down by 1,878, from 11,099 to 9,221
tweets.shape[0]

#### Create the prediction label (number of retweets)

In [None]:
# The label (whether a tweet goes viral) will be derived from 'retweet_count',
# so look at a random sample of its values first.
# random_state is fixed so that Restart & Run All shows the same 20 rows.
tweets['retweet_count'].sample(20, random_state=1)

In [None]:
# Summary statistics of retweet_count, to get a feel for its distribution
# before choosing the threshold that defines a viral tweet
tweets['retweet_count'].describe()

In [None]:
# Density of retweet counts on a log2 x-axis.
# Zero counts are replaced by 1 first (presumably because a log axis cannot show 0).
fig, ax = plt.subplots(figsize=(10, 6))
x = tweets['retweet_count'].replace(0, 1)
sns.kdeplot(x, log_scale=2, ax=ax)

#### Create y_label
#### Label a tweet as viral when its retweet count is over 100 (1 = viral, 0 = not viral)

In [None]:
# Binary label: 1 when the tweet has more than 100 retweets, otherwise 0
tweets['is_viral'] = (tweets['retweet_count'] > 100).astype(int)
tweets['is_viral'].value_counts()

In [None]:
# Share of tweets carrying the viral label, as a percentage with one decimal
viral_total = tweets['is_viral'].sum()
percent = round(100 * viral_total / len(tweets), 1)
print(f'Percent of tweets that is labeled viral: {percent}%')

#### Making features

In [None]:
def _average_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Returns 0.0 when ``text`` contains no words (empty or whitespace-only)
    instead of raising ZeroDivisionError.
    """
    words = text.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)


# --- Size of the tweet and of the author's network ---
tweets['tweet_length'] = tweets['text'].str.len()
# Each 'user' entry is a dict; .str[key] looks the key up row by row
tweets['followers_count'] = tweets['user'].str['followers_count']
tweets['friends_count'] = tweets['user'].str['friends_count']

# --- Text-derived counts ---
tweets['links_count'] = tweets['text'].str.count('http')
tweets['words_count'] = tweets['text'].str.split().str.len()
tweets['average_word_length'] = tweets['text'].apply(_average_word_length)
tweets['capital_letters_count'] = tweets['text'].apply(
    lambda tweet: sum(1 for char in tweet if char.isupper())
)
# .str.count takes a regex; '!', '#' and '@' have no special meaning there,
# so each pattern counts literal occurrences of that character.
tweets['!_count'] = tweets['text'].str.count('!')
tweets['#_count'] = tweets['text'].str.count('#')
tweets['@_count'] = tweets['text'].str.count('@')
# NOTE: plain substring test, so this is also 1 when 'RT' merely appears inside
# another word. new_sample_prediction() computes the feature the same way;
# change both together or the model sees different features at predict time.
tweets['if_RT'] = tweets['text'].apply(lambda tweet: 1 if 'RT' in tweet else 0)

# Engineered features plus 'favorite_count', which comes straight from the raw data
list_of_features = ['tweet_length', 'followers_count', 'friends_count', 'links_count', 'words_count',
                    'average_word_length', 'capital_letters_count', '!_count', '#_count',
                    '@_count', 'if_RT', 'favorite_count']
# Same list extended with the binary label and the raw count it is derived from
list_of_features_and_labels = list_of_features + ['is_viral', 'retweet_count']

In [None]:
# First 10 rows of the engineered features together with the labels
tweets[list_of_features_and_labels].head(10)

In [None]:
# Summary statistics for every feature and label column
tweets[list_of_features_and_labels].describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all features and labels
corr_matrix = tweets[list_of_features_and_labels].corr()
plt.figure(figsize=(14, 12), dpi=200)
sns.heatmap(corr_matrix, annot=True)

#### Select features

In [None]:
# Select features that have stronger positive/negative correlation with 'is_viral',
# that is, correlation greater than 0.1 or less than -0.1 (read off the heatmap).
# The order of this list is the column order the model is trained on.
selected_features = ['tweet_length', 'words_count', 'if_RT', 'links_count']
features = tweets[selected_features]
# Spot-check 10 random rows; random_state is fixed so reruns show the same rows
features.sample(10, random_state=1)

In [None]:
features.describe() # Sanity check: summary statistics of the selected feature columns

In [None]:
tweets['is_viral'].value_counts() # Sanity check: number of rows for each label value

#### Train model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    tweets['is_viral'],
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression classifier on the training split
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predict the held-out rows and report accuracy as a percentage (2 decimals)
predictions = log_reg.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
round(test_accuracy * 100, 2)

#### Trying out the model on sample tweets

In [None]:
# Function to extract features of a sample tweet to use to predict
def new_sample_prediction(sample, model=None):
    """Print a tweet's features and whether the model predicts it will go viral.

    The four features are computed the same way as the training columns
    'tweet_length', 'words_count', 'if_RT' and 'links_count'.

    Parameters
    ----------
    sample : str
        Text of the tweet to classify.
    model : object, optional
        Fitted classifier exposing ``predict``. Defaults to the notebook-level
        ``log_reg``.

    Returns
    -------
    None
        The features and the prediction are printed, not returned.

    Raises
    ------
    KeyError
        If ``model.feature_names_in_`` names a feature this function does not
        compute.
    """
    if model is None:
        model = log_reg

    # Keys are the training column names, listed in the training column order.
    feature_values = {
        'tweet_length': len(sample),
        'words_count': len(sample.split()),
        # Substring test, identical to how the 'if_RT' training column is built
        'if_RT': 1 if 'RT' in sample else 0,
        'links_count': sample.count('http'),
    }

    print("Tweet length: " + str(feature_values['tweet_length']))
    print("Words count: " + str(feature_values['words_count']))
    print("Retweet: Yes" if feature_values['if_RT'] else "Retweet: No")
    print("Links count: " + str(feature_values['links_count']))

    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        # Model was fitted on a plain array: pass the values positionally.
        X_new = np.array(list(feature_values.values())).reshape(1, -1)
    else:
        # Model was fitted on a DataFrame: hand it a one-row frame with the same
        # column names in the same order. This avoids scikit-learn's
        # "X does not have valid feature names" warning and removes the
        # dependence on a hand-maintained append order.
        X_new = pd.DataFrame([feature_values])[list(feature_names)]

    y_predict = model.predict(X_new)
    if y_predict[0] == 1:
        print("Prediction: Tweet will go viral")
    else:
        print("Prediction: Tweet will not go viral")

In [None]:
# Synthetic sample tweet containing both an 'RT' marker and an 'http' link
sample = 'Testing tweet RT http test test'

# Print the sample's features and the model's viral / not-viral prediction
new_sample_prediction(sample)

In [None]:
# Sample tweet: multi-line text with no 'RT' marker and no link.
# The leading/trailing newlines of the triple-quoted string are part of the text
# and therefore count towards the tweet length.
sample = '''
Today, the Ingenuity Mars Helicopter achieved liftoff — becoming the first aircraft to fly on another planet. 
NASA proved once again that with relentless determination and the power of America’s best minds, anything is possible.
'''

# Print the sample's features and the model's viral / not-viral prediction
new_sample_prediction(sample)

In [None]:
# Sample tweet with a mention and a hashtag; neither '@' nor '#' counts are among
# the selected features, so only length, word count, 'RT' and links matter here.
sample = '''
"The weirdest Olympics ever."
The event was supposed to symbolize the best of humanity. But even as the cauldron is lit, the challenges for organizers 
and officials will only intensify. @julesaly looks at what it took to get the #TokyoOlympics to the starting line
'''

# Print the sample's features and the model's viral / not-viral prediction
new_sample_prediction(sample)