# May to Sept Traction Prediction

In [1]:
# Import libraries

import pandas as pd
import ast 
pd.set_option('display.max_colwidth', None)

In [2]:
data = pd.read_csv("may_june_jul_aug_sep_data_merged.csv")
data.shape

(87637, 10)

## 1. Data Preparation

In [3]:
def load_data(filename):
    df = pd.read_csv("may_june_jul_aug_sep_data_merged.csv") #.head(100)

    # Selecting Relevent Columms
    df = df[
        [
            "published",
            "headline",
            "summary",
            "link",
            "domain",
            "facebook_interactions",
            "date_extracted",
            "suggested_labels",
        ]
    ]

    # df = df.rename(columns={"Published":"published", "Headline":"headline", "Summary":"summary", "Link":"link", "Domain":"domain", "Facebook Interactions":"facebook_interactions"})

    # Converting published column to datetime
    df["published"] = pd.to_datetime(df["published"])
    df["date_extracted"] = pd.to_datetime(df["date_extracted"])

    # Extracting theme and index for each article
    df["suggested_labels"] = df["suggested_labels"].apply(lambda x: ast.literal_eval(x))
    df["article_theme"] = df["suggested_labels"].apply(lambda x: x[0].split(" > ")[0])
    df["article_index"] = df["suggested_labels"].apply(lambda x: x[0].split(" > ")[1])
    df = df.drop(columns=["suggested_labels"])

    # Sort articles
    df = df.sort_values(
        by=["headline", "published", "date_extracted"], ascending=[False, True, True]
    ).reset_index(drop=True)
    return df


def create_unique_df(df):
    df_unique = df.drop_duplicates(
        subset=["headline", "published"], keep="first"
    ).reset_index(drop=True)
    # Remove articles with indexes having less than 2 occurences
    df_unique = df_unique.groupby("article_index").filter(lambda x: len(x) > 1)

    return df_unique

In [4]:
df = load_data('may_june_jul_aug_sep_data_merged.csv')
df_unique = create_unique_df(df)

## 2. Feature Engineering

In [9]:
from textblob import TextBlob
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Salman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
def engineer_features(df_unique):

    # Concat headline and summary
    df_unique['text'] = df_unique['headline'] + ' ' + df_unique['summary']

    # creature a feature for the number of hours since the article of the same article_index was published
    df_unique['hours_since_published'] = df_unique.sort_values(by=['published']).groupby(['article_index'])['published'].diff().dt.total_seconds().div(3600)
    df_unique['hours_since_published'] = df_unique['hours_since_published'].fillna(0)

    # create a feature for day of week published
    df_unique['day_of_week_published'] = df_unique['published'].dt.day_name().map({'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6,'Sunday': 7})

    # Create a featyre for hour of day published
    df_unique['hour_of_day_published'] = df_unique['published'].dt.hour

    # create a feature for number of words in headline
    df_unique['headline_word_count'] = df_unique['headline'].str.split().str.len()

    # create a feature for number of words in summary
    df_unique['summary_word_count'] = df_unique['summary'].str.split().str.len()

    # create a feature for number of words in text
    df_unique['text_word_count'] = df_unique['text'].str.split().str.len()

    # create a feature for number of non-stop words in headline and summary
    stop_words = set(stopwords.words('english'))
    df_unique['headline_non_stop_word_count'] = df_unique['headline'].apply(lambda x: len([word for word in str(x).split() if word.lower() not in stop_words]))
    df_unique['summary_non_stop_word_count'] = df_unique['summary'].apply(lambda x: len([word for word in str(x).split() if word.lower() not in stop_words]))
    df_unique['text_non_stop_word_count'] = df_unique['text'].apply(lambda x: len([word for word in str(x).split() if word.lower() not in stop_words]))

    # create a feature for number of unique words in headline and summary
    df_unique['headline_unique_word_count'] = df_unique['headline'].apply(lambda x: len(set([word for word in str(x).split()])))
    df_unique['summary_unique_word_count'] = df_unique['summary'].apply(lambda x: len(set([word for word in str(x).split()])))
    df_unique['text_unique_word_count'] = df_unique['text'].apply(lambda x: len(set([word for word in str(x).split()])))

    # creature a feature for the sentiment of the headline and summary
    df_unique['headline_sentiment'] = df_unique['headline'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    df_unique['summary_sentiment'] = df_unique['summary'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    df_unique['text_sentiment'] = df_unique['text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

    # create a feature for the subjectivity of the headline and summary
    df_unique['headline_subjectivity'] = df_unique['headline'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
    df_unique['summary_subjectivity'] = df_unique['summary'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
    df_unique['text_subjectivity'] = df_unique['text'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)

    # Remove NA rows
    df_unique = df_unique.dropna()

    return df_unique

In [11]:
domain_map = {
    'asiaone': ['asiaone.com'],
    'businesstimes': ['businesstimes.com.sg'],
    'channelnewsasia': ['channelnewsasia.com', 'cnalifestyle.channelnewsasia.com', 'cnaluxury.channelnewsasia.com'],
    'goodyfeed': ['goodyfeed.com'],
    'mothership': ['babelfish.mothership.sg', 'mothership.sg', ],
    'mustsharenews': ['mustsharenews.com'],
    'ricemedia': ['ricemedia.co'],
    'straitstimes': ['straitstimes.com'],
    'stomp': ['stomp.straitstimes.com'],
    'thenewpaper': ['tnp.straitstimes.com'],
    'theonlinecitizen': ['theonlinecitizen.com'],
    'today': ['todayonline.com'],
    'yahoosingapore': ['sg.finance.yahoo.com', 'sg.news.yahoo.com', 'sg.style.yahoo.com'],
    # 'others': ['au.lifestyle.yahoo.com', 'au.news.yahoo.com','coconuts.co','malaysia.news.yahoo.com','public.flourish.studio', 'uk.news.yahoo.com']
}

reverse_domain_map = {}
for key, value in domain_map.items():
    for v in value:
        reverse_domain_map[v] = key

domain_traffic = {
    'asiaone': [4990000,1290000],
    'businesstimes': [1630000, 622688],
    'channelnewsasia': [14760000, 2330000],
    'goodyfeed': [891376, 461731],
    'mothership': [6770000, 1660000],
    'mustsharenews': [2010000, 823526],
    'ricemedia': [148791, 86394],
    'straitstimes': [12310000, 2230000],
    'stomp': [1270000, 427113],
    'theindependent': [2370000, 683818],
    'thenewpaper': [707314, 350734],
    'theonlinecitizen': [657013, 233237],
    'today': [3660000, 1170000],
    'yahoosingapore': [5010000, 561434],
}

traffic_table = pd.DataFrame.from_dict(domain_traffic, orient='index', columns=['mean_monthly_visits', 'mean_monthly_unique_visitors']).reset_index().rename(columns={'index':'domain_group'})

In [12]:
def filter_df(df):
    df = df[df["domain"].isin([i for a in list(domain_map.values()) for i in a])]
    return df

def merge_traffic_table(df_unique):
    df_unique['domain_group'] = df_unique['domain'].map(reverse_domain_map)
    df_unique = df_unique.merge(traffic_table, on='domain_group', how='left')
    return df_unique

In [13]:
df = filter_df(df)

df_unique = engineer_features(df_unique)

df_unfiltererd = df_unique.copy()

df_unique = filter_df(df_unique)
df_unique = merge_traffic_table(df_unique)

In [14]:
df_unique.columns

Index(['published', 'headline', 'summary', 'link', 'domain',
       'facebook_interactions', 'date_extracted', 'article_theme',
       'article_index', 'text', 'hours_since_published',
       'day_of_week_published', 'hour_of_day_published', 'headline_word_count',
       'summary_word_count', 'text_word_count', 'headline_non_stop_word_count',
       'summary_non_stop_word_count', 'text_non_stop_word_count',
       'headline_unique_word_count', 'summary_unique_word_count',
       'text_unique_word_count', 'headline_sentiment', 'summary_sentiment',
       'text_sentiment', 'headline_subjectivity', 'summary_subjectivity',
       'text_subjectivity', 'domain_group', 'mean_monthly_visits',
       'mean_monthly_unique_visitors'],
      dtype='object')

## 3. Data Labelling

In [15]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
GRD_THRESHOLD_UPPER = 1
GRD_THRESHOLD_LOWER = -1
VAL_ARR_THRESHOLD_UPPER = 1
VAL_ARR_THRESHOLD_LOWER = -1

In [17]:
def cal_article_mean_and_gradient(val_array):
    val_array = np.asarray(val_array)
    mean = np.mean(val_array)
    val_array = np.nan_to_num(val_array)
    model = LinearRegression()
    model.fit(np.arange(len(val_array)).reshape(-1, 1), val_array)
    gradient = model.coef_[0]
    return mean, gradient

def agg_mean_and_gradient(df):
    df_agg = df.groupby('link')['facebook_interactions'].agg(cal_article_mean_and_gradient).reset_index()
    return df_agg

def merge_labels(df_unique, df_agg):
    df_labelled = df_unique.merge(df_agg, on='link', how='left')
    df_labelled = df_labelled.rename(columns={'facebook_interactions_x': 'facebook_interactions', 'facebook_interactions_y': 'mean_and_gradient'})
    df_labelled['mean'] = df_labelled['mean_and_gradient'].apply(lambda x: x[0])
    df_labelled['gradient'] = df_labelled['mean_and_gradient'].apply(lambda x: x[1])
    df_labelled.drop(columns=['mean_and_gradient'], inplace=True)
    df_labelled['gradient'] = df_labelled['gradient'].apply(lambda x: 0 if x < 0 else x)
    return df_labelled

def normalise_mean_and_gradient(df_labelled):
    df_labelled['mean_s_scaled'] = StandardScaler().fit_transform(df_labelled[['mean']])
    df_labelled['gradient_s_scaled'] = StandardScaler().fit_transform(df_labelled[['gradient']])

    df_labelled['mean_mm_scaled'] = MinMaxScaler().fit_transform(df_labelled[['mean']])
    df_labelled['gradient_mm_scaled'] = MinMaxScaler().fit_transform(df_labelled[['gradient']])

    df_labelled['mean_r_scaled'] = RobustScaler().fit_transform(df_labelled[['mean']])
    df_labelled['gradient_r_scaled'] = RobustScaler().fit_transform(df_labelled[['gradient']])

    df_labelled['mean_pt_scaled'] = PowerTransformer().fit_transform(df_labelled[['mean']])
    df_labelled['gradient_pt_scaled'] = PowerTransformer().fit_transform(df_labelled[['gradient']])

    df_labelled['mean_qt_scaled'] = QuantileTransformer().fit_transform(df_labelled[['mean']])
    df_labelled['gradient_qt_scaled'] = QuantileTransformer().fit_transform(df_labelled[['gradient']])

    df_labelled['mean_log'] = np.log(df_labelled['mean'])
    df_labelled['gradient_log'] = np.log(df_labelled['gradient'])
    return df_labelled

def plot_gradient(df_labelled):
    fig, ax = plt.subplots(1, 7, figsize=(35, 5))
    sns.histplot(data=df_labelled, x='gradient', bins=100, kde=True, ax=ax[0])
    sns.histplot(data=df_labelled, x='gradient_s_scaled', bins=100, kde=True, ax=ax[1])
    sns.histplot(data=df_labelled, x='gradient_mm_scaled', bins=100, kde=True, ax=ax[2])
    sns.histplot(data=df_labelled, x='gradient_r_scaled', bins=100, kde=True, ax=ax[3])
    sns.histplot(data=df_labelled, x='gradient_pt_scaled', bins=100, kde=True, ax=ax[4])
    sns.histplot(data=df_labelled, x='gradient_qt_scaled', bins=100, kde=True, ax=ax[5])
    sns.histplot(data=df_labelled, x='gradient_log', bins=100, kde=True, ax=ax[6])
    plt.show()

def plot_mean(df_labelled):
    fig, ax = plt.subplots(1, 7, figsize=(35,5))
    sns.histplot(data=df_labelled, x='mean', bins=100, kde=True, ax=ax[0])
    sns.histplot(data=df_labelled, x='mean_s_scaled', bins=100, kde=True, ax=ax[1])
    sns.histplot(data=df_labelled, x='mean_mm_scaled', bins=100, kde=True, ax=ax[2])
    sns.histplot(data=df_labelled, x='mean_r_scaled', bins=100, kde=True, ax=ax[3])
    sns.histplot(data=df_labelled, x='mean_pt_scaled', bins=100, kde=True, ax=ax[4])
    sns.histplot(data=df_labelled, x='mean_qt_scaled', bins=100, kde=True, ax=ax[5])
    sns.histplot(data=df_labelled, x='mean_log', bins=100, kde=True, ax=ax[6])
    plt.show()

def get_label(row, colums):
    gradient_index = colums.index('gradient_pt_scaled')
    mean_index = colums.index('mean_pt_scaled')

    if row[gradient_index] > GRD_THRESHOLD_UPPER or row[mean_index] > VAL_ARR_THRESHOLD_UPPER:
        return "trending"
    
    # elif row[gradient_index] < GRD_THRESHOLD_LOWER or row[mean_index] < VAL_ARR_THRESHOLD_LOWER:
    #     return "diminishing"
    
    else:
        return "not_trending"
    
def map_labels_to_numertical(df_labelled):
    label_mapper = {'trending': 1, 'not_trending': 0}
    df_labelled['label'] = df_labelled['label'].map(label_mapper)
    return df_labelled
    
def plot_labels(df_labelled):
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df_labelled, x='label')
    plt.title('Distribution of Target Variable')
    plt.xlabel('Target')
    plt.ylabel('Count')
    plt.show()

def merge_labels_to_main(df_labelled, df):
    df_combined = df.merge(df_labelled[['link', 'label']], on='link', how='left')
    return df_combined

def get_article_activity_by_day(df_labelled, df):
    df_combined = df.merge(df_labelled[['link', 'label', 'mean_monthly_visits', 'hours_since_published', 'text_sentiment', 'text_subjectivity']], on='link', how='left')
    unique_articles = df_combined['link'].unique()
    activity_df = pd.DataFrame()

    for article in unique_articles:
        df_slice = df_combined[df_combined['link'] == article].reset_index(drop=True)
        df_slice = df_slice.reset_index().rename(columns={'index': 'day'})
        df_slice['day'] = df_slice['day'] + 1
        activity_df = activity_df.append(df_slice)

    return activity_df
    
def plot_activities(activity_df):
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=activity_df, x='day', y='facebook_interactions', hue='label')
    plt.title('Facebook Interactions over Days')
    plt.xlabel('Day')
    plt.ylabel('Facebook Interactions')
    plt.show()

In [18]:
df_agg = agg_mean_and_gradient(df)
df_labelled = merge_labels(df_unique, df_agg)
df_labelled = normalise_mean_and_gradient(df_labelled)
df_labelled['label'] = df_labelled.apply(lambda x: get_label(x, list(df_labelled.columns)), axis=1)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [19]:
df_labelled['label'].value_counts()

not_trending    11602
trending         4600
Name: label, dtype: int64

In [None]:
activity_df = get_article_activity_by_day(df_labelled, df)

  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity_df = activity_df.append(df_slice)
  activity

In [None]:
activity_df['label'].value_counts()

In [None]:
activity_df.columns

In [None]:
activity_df.to_csv("activity_df_may_sep.csv")

## `hours_df` - Readjust interactions using interpolation

In [None]:
def get_hours_active(activity_df):

    unique_articles = activity_df['link'].unique()
    all_hours_active = []
    all_delta_interactions = []
    hours_df = pd.DataFrame()
    
    for article in unique_articles:
        temp = activity_df.loc[activity_df['link']==article]
        
        day1hour = (24 - (temp['published'].dt.hour + temp['published'].dt.minute/60))[0]
        total_hour_active = day1hour
        hours_active = [day1hour]   
        
        for i in range(temp.shape[0]-1):
            total_hour_active = total_hour_active + 24
            hours_active.append(total_hour_active)
        
        all_hours_active.append(hours_active)
        temp['hours_active'] = hours_active
        temp['delta_facebook_interactions'] = temp['facebook_interactions'] - temp['facebook_interactions'].shift(periods=1, fill_value=0)
        temp['total_day_hours'] = temp["day"] * 24
        temp['missed_hours'] = temp['total_day_hours'] - temp['hours_active'] 
        temp['interpolated_facebook_interactions'] = temp['facebook_interactions'] + (temp['delta_facebook_interactions'].shift(periods=-1, fill_value=0) * (temp['missed_hours']/24))

        
        hours_df = hours_df.append(temp)
    
    return hours_df

In [None]:
hours_df = get_hours_active(activity_df)

In [None]:
hours_df['normalised_interpolated_facebook_interactions'] = hours_df['interpolated_facebook_interactions']/1000

In [None]:
articles_6_days = hours_df.loc[hours_df['day'] >= 6]['link'].unique()
hours_df = hours_df.loc[hours_df['link'].isin(articles_6_days)]

In [None]:
activity_df['published'] = pd.to_datetime(activity_df['published'])

unique_articles = activity_df['link'].unique()
all_hours_active = []
all_delta_interactions = []
hours_df = pd.DataFrame()

for article in unique_articles:
    temp = activity_df.loc[activity_df['link']==article]
    
    day1hour = (24 - (temp['published'].dt.hour + temp['published'].dt.minute/60))[0]
    total_hour_active = day1hour
    hours_active = [day1hour]   
    
    for i in range(temp.shape[0]-1):
        total_hour_active = total_hour_active + 24
        hours_active.append(total_hour_active)
    
    all_hours_active.append(hours_active)
    temp['hours_active'] = hours_active
    temp['delta_facebook_interactions'] = temp['facebook_interactions'] - temp['facebook_interactions'].shift(periods=1, fill_value=0)
    temp['total_day_hours'] = temp["day"] * 24
    temp['missed_hours'] = temp['total_day_hours'] - temp['hours_active'] 
    temp['interpolated_facebook_interactions'] = temp['facebook_interactions'] + (temp['delta_facebook_interactions'].shift(periods=-1, fill_value=0) * (temp['missed_hours']/24))

    
    hours_df = hours_df.append(temp)

In [None]:
hours_df.to_csv("hours_may_sep.csv")