## Attribute Description :

#### Input variables : 
* url: URL of the article (non-predictive)
* timedelta: Days between the article publication and the dataset acquisition (non-predictive)
* n_tokens_title: Number of words in the title
* n_tokens_content: Number of words in the content
* n_unique_tokens: Rate of unique words in the content
* n_non_stop_words: Rate of non-stop words in the content
* n_non_stop_unique_tokens: Rate of unique non-stop words in the content
* num_hrefs: Number of links
* num_self_hrefs: Number of links to other articles published by Mashable
* num_imgs: Number of images
* num_videos: Number of videos
* average_token_length: Average length of the words in the content
* num_keywords: Number of keywords in the metadata
* data_channel_is_lifestyle: Is data channel 'Lifestyle'?
* data_channel_is_entertainment: Is data channel 'Entertainment'?
* data_channel_is_bus: Is data channel 'Business'?
* data_channel_is_socmed: Is data channel 'Social Media'?
* data_channel_is_tech: Is data channel 'Tech'?
* data_channel_is_world: Is data channel 'World'?
* kw_min_min: Worst keyword (min. shares)
* kw_max_min: Worst keyword (max. shares)
* kw_avg_min: Worst keyword (avg. shares)
* kw_min_max: Best keyword (min. shares)
* kw_max_max: Best keyword (max. shares)
* kw_avg_max: Best keyword (avg. shares)
* kw_min_avg: Avg. keyword (min. shares)
* kw_max_avg: Avg. keyword (max. shares)
* kw_avg_avg: Avg. keyword (avg. shares)
* self_reference_min_shares: Min. shares of referenced articles in Mashable
* self_reference_max_shares: Max. shares of referenced articles in Mashable
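* self_reference_avg_sharess: Avg. shares of referenced articles in Mashable (the doubled "s" is the raw column name; it is renamed to `self_reference_avg_shares` during data cleaning)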
* weekday_is_monday: Was the article published on a Monday?
* weekday_is_tuesday: Was the article published on a Tuesday?
* weekday_is_wednesday: Was the article published on a Wednesday?
* weekday_is_thursday: Was the article published on a Thursday?
* weekday_is_friday: Was the article published on a Friday?
* weekday_is_saturday: Was the article published on a Saturday?
* weekday_is_sunday: Was the article published on a Sunday?
* is_weekend: Was the article published on the weekend?
* LDA_00: Closeness to LDA topic 0
* LDA_01: Closeness to LDA topic 1
* LDA_02: Closeness to LDA topic 2
* LDA_03: Closeness to LDA topic 3
* LDA_04: Closeness to LDA topic 4
* global_subjectivity: Text subjectivity
* global_sentiment_polarity: Text sentiment polarity
* global_rate_positive_words: Rate of positive words in the content
* global_rate_negative_words: Rate of negative words in the content
* rate_positive_words: Rate of positive words among non-neutral tokens
* rate_negative_words: Rate of negative words among non-neutral tokens
* avg_positive_polarity: Avg. polarity of positive words
* min_positive_polarity: Min. polarity of positive words
* max_positive_polarity: Max. polarity of positive words
* avg_negative_polarity: Avg. polarity of negative words
* min_negative_polarity: Min. polarity of negative words
* max_negative_polarity: Max. polarity of negative words
* title_subjectivity: Title subjectivity
* title_sentiment_polarity: Title polarity
* abs_title_subjectivity: Absolute subjectivity level
* abs_title_sentiment_polarity: Absolute polarity level
* shares: Number of shares (target)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import requests
from bs4 import BeautifulSoup

# Select seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set_style('whitegrid')

# Data cleaning

In [None]:
# Toggle for the cleaning cells: True rebuilds the cleaned data from
# 'onlinenews.csv' and rewrites 'onlinenews_modified.csv'; False loads the
# previously saved 'onlinenews_modified.csv' instead.
rerun_basic_data_cleaning = False

In [None]:
# Load the data set: the cached, already-cleaned CSV by default, or the raw
# CSV (with its header repaired) when a full re-clean was requested.
if not rerun_basic_data_cleaning:
    df = pd.read_csv('onlinenews_modified.csv')
else:
    df = pd.read_csv('onlinenews.csv')
    # Remove surrounding whitespace from every raw header name.
    df.columns = [name.strip() for name in df.columns]
    # The raw file spells this column with a doubled trailing 's'.
    df = df.rename(columns={'self_reference_avg_sharess':'self_reference_avg_shares'})

In [None]:
def get_data_channel(url):
    """Scrape the data channel of an article from its web page.

    Args:
        url: Address of the article page to download.

    Returns:
        The lower-cased text of the first ``<h2>`` that is a direct child of
        an ``<hgroup data-channel=...>`` element on the page.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page contains no matching element.
    """
    # Download the page of the article that was asked for; the timeout keeps
    # one unresponsive server from stalling a whole scraping loop.
    page = requests.get(url, timeout=30)
    # Do not try to parse error pages (404, 5xx, ...).
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.select('hgroup[data-channel]>h2')[0].get_text().lower()

In [None]:
# One-off cleaning of the raw frame; the result is cached to
# 'onlinenews_modified.csv' so the (slow, network-bound) steps can be skipped.
if rerun_basic_data_cleaning:
    # date column: URL path segments 3..5, reversed and joined with '/'
    # (assumes URLs look like http://host/YYYY/MM/DD/slug, giving DD/MM/YYYY
    # -- TODO confirm against the data).
    df['date'] = df['url'].map(lambda x: '/'.join(x.split('/')[3:6][::-1]))

    # unify weekday columns: 1 = sunday ... 7 = saturday; stays 0 when no
    # weekday_is_* indicator is set for a row.
    df['weekday'] = 0
    for i, day in enumerate(['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']):
        df['weekday'] += (i + 1) * df[f'weekday_is_{day}']
    df = df.drop([i for i in df.columns if 'weekday_is' in i], axis=1)

    # replace data_channel_* features with single data_channel feature that
    # holds the last '_'-separated part of the indicator name (e.g. 'bus').
    df['data_channel'] = ''
    data_channels = [i for i in df.columns if 'data_channel_' in i]
    for c in data_channels:
        df.loc[df[c] == 1,'data_channel'] = c.split('_')[-1]
    df = df.drop(data_channels,axis=1)

    # get missing data_channel values by scraping each article page
    values = df[df['data_channel']=='']['data_channel'].copy()
    for i in df[df['data_channel']==''].index:
        try:
            values.loc[i] = get_data_channel(df.loc[i,'url'])
        except Exception:
            # Best effort: keep the empty channel when the page cannot be
            # fetched or parsed. Only Exception is caught so that Ctrl-C
            # (KeyboardInterrupt) can still stop this long-running loop.
            pass
    df.loc[df['data_channel']=='','data_channel'] = values

    # Manual corrections for individual rows, addressed by row label.
    df.loc[21386,'data_channel'] = 'world'
    df.loc[17003,'data_channel'] = 'entertainment'
    # Drop row 622 and renumber the rows without keeping the old index.
    df = df.drop(622).reset_index(drop=True)

    # Use the same short name as the indicator-derived values ('bus').
    df.loc[df['data_channel']=='business','data_channel'] = 'bus'

    # save to csv
    df.to_csv('onlinenews_modified.csv', index=False)

# Data Analysis

In [None]:
# Alternative, larger feature list (currently disabled):
# cols = ['n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
#         'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs',
#         'num_imgs', 'num_videos', 'average_token_length', 'num_keywords', 'is_weekend',
#         'global_subjectivity', 'title_subjectivity', 'title_sentiment_polarity',
#         'global_sentiment_polarity', 'rate_positive_words', 'rate_negative_words',
#         'data_channel', 'shares']

# Columns kept for the analysis: features, treatment and target.
cols = ['n_tokens_title', 'n_tokens_content', 'num_imgs', 'num_videos', 'num_keywords', 'is_weekend',
        'global_subjectivity', 'title_subjectivity', 'title_sentiment_polarity',
        'global_sentiment_polarity', 'rate_positive_words',
        'data_channel', 'shares']
# Take an explicit copy: later cells add columns to df, and writing into a
# frame derived by column selection triggers pandas' SettingWithCopyWarning.
df = df[cols].copy()
t_label = 'is_weekend'  # treatment column
y_label = 'shares'      # outcome column

### Outliers

In [None]:
# Upper-tail cutoff of the target, used to keep extreme values out of the
# histogram drawn from `percentile_value`.
percentile = 0.99
# Use the configured quantile and target column instead of repeating literals.
percentile_value = df[y_label].quantile(percentile)
print(f'Percentile value: {percentile_value:.0f}')
print(f'Max value: {df[y_label].max()}')

In [None]:
# Histogram of the target, restricted to rows strictly below the percentile cutoff.
df[df[y_label] < percentile_value][y_label].hist()

### Correlation

In [None]:
# Pearson correlation heatmap over numeric/boolean columns only: a text column
# such as 'data_channel' makes DataFrame.corr() raise in recent pandas versions
# (older versions silently dropped such columns).
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(method='pearson'), cmap='vlag')

### data-channel counts

In [None]:
# Number of rows per data channel, shown as a bar chart.
rows_per_channel = df.groupby('data_channel')['data_channel'].count()
rows_per_channel.plot(kind='bar')
plt.title('Channel counts')

## Row counts: weekend vs. weekday articles

In [None]:
# Number of rows for each value of the treatment column (t_label).
df[t_label].value_counts()

# Preparation for the models

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from utils import factorize, propensity_func, trim_common_support, \
    balance_weights_for_histplot

# Seed NumPy's global random generator so repeated runs draw the same numbers
# (presumably this is what makes the unseeded train_test_split call
# repeatable -- TODO confirm).
np.random.seed(101)

In [None]:
# Reload the local helper module so edits to it are picked up without
# restarting the kernel.
import importlib
import utils
importlib.reload(utils)
# reload() does not rebind names that were imported with
# `from utils import ...`, so import them again to get the refreshed objects.
from utils import factorize, propensity_func, trim_common_support, \
    balance_weights_for_histplot

In [None]:
# The return value is not used, so factorize presumably encodes columns of df
# in place (e.g. the text column 'data_channel') -- TODO confirm in utils.
factorize(df)

# Propensity estimation

In [None]:
# Hold out a fraction of the rows for evaluating the propensity models.
test_size = 0.2
df_train, df_test = train_test_split(df, test_size=test_size)

# Covariates: every column except the treatment and the outcome.
x, x_train, x_test = (
    frame.drop(columns=[t_label, y_label]) for frame in (df, df_train, df_test)
)
# Treatment indicator for the full frame and for each split.
t, t_train, t_test = (frame[t_label] for frame in (df, df_train, df_test))

In [None]:
# One propensity estimator per model family, each built from the training
# split; entries are created in the order log, random_forest, boosting.
propensity_estimators = dict(
    log=propensity_func(df_train, solver='liblinear', penalty='l2'),
    random_forest=propensity_func(
        df_train,
        method='random_forest',
        max_depth=7,
        min_samples_leaf=5,
    ),
    boosting=propensity_func(
        df_train,
        method='boosting',
        learning_rate=0.015,
        n_estimators=400,
        max_depth=7,
    ),
)

In [None]:
# Report the test-split ROC AUC of every propensity estimator.
print("auroc:")
for name, predict in propensity_estimators.items():
    auc = roc_auc_score(t_test, predict(x_test))
    print(f"  {name:<15}: {auc}")

In [None]:
# Apply the "boosting" estimator to the covariates of the full frame and
# store the result as a new 'propensity' column.
# NOTE(review): x includes the rows the estimator was built from, so these
# scores look partly in-sample -- confirm this is intended.
propensity_scores = propensity_estimators["boosting"](x)
df['propensity'] = propensity_scores

In [None]:
# Histogram of the 'propensity' column in 20 bins, coloured by the treatment
# column (t_label).
sns.histplot(df, x='propensity', bins=20, hue=t_label)
plt.xlabel('propensity score')
plt.ylabel('number of articles')
plt.show()

In [None]:
# Replace df with the output of trim_common_support (presumably drops rows
# whose propensity lies outside the range shared by both treatment groups --
# TODO confirm in utils).
df = trim_common_support(df, t_label)

In [None]:
# Same propensity histogram after trimming, with per-row weights supplied by
# balance_weights_for_histplot (presumably rescales the groups to comparable
# totals -- TODO confirm in utils).
sns.histplot(df, x='propensity', hue=t_label,bins=20, weights=balance_weights_for_histplot(df))

# ATE estimation

In [None]:
from ate_estimators import ipw_ate, matching_ate, s_learner_ate, t_learner_ate, \
    x_learner_ate

In [None]:
# The estimators receive the frame without the 'propensity' column; IPW and
# the X-learner get the propensity scores as a separate argument.
df_no_prop = df.drop(columns='propensity')

# One (name, estimate) row per estimator, evaluated in the listed order.
ates = pd.DataFrame(
    [
        ('ipw_ate', ipw_ate(df_no_prop, df['propensity'])),
        ('matching_ate', matching_ate(df_no_prop)),
        ('s_learner_ate', s_learner_ate(df_no_prop)),
        ('t_learner_ate', t_learner_ate(df_no_prop)),
        ('x_learner_ate', x_learner_ate(df_no_prop, df['propensity'])),
    ],
    columns=['Type', 'ATE'],
)
# Displayed as the cell output; `ates` itself keeps 'Type' as a column.
ates.set_index('Type')

# ### old

### Weekday histogram per data channel

In [None]:
# For every data channel: number of rows per weekday, drawn as one bar chart
# per channel and collected in `counts` (channel -> Series indexed by weekday).
# NOTE(review): the `cols` selection earlier in this file does not include
# 'weekday', so this cell needs a frame that still has that column -- confirm.
counts = {}
for i in df['data_channel'].unique():
    counts[i] = df[df['data_channel']==i].groupby('weekday')[y_label].count()
    counts[i].plot(kind='bar', title=i)
    plt.show()

In [None]:
# Correlation between channels of their per-weekday row counts (one column
# per channel in `counts`), annotated with the coefficient values.
sns.heatmap(pd.DataFrame(counts).corr(), annot=True)
plt.title('Week-day number of shares correlation')

In [None]:
# Content-structure columns (token, link, image, video and keyword counts).
# 'n_unique_tokens' is the data set's column name for the unique-word rate.
cols = ['n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
        'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs',
        'num_imgs', 'num_videos', 'average_token_length', 'num_keywords']

In [None]:
# Print the column names, non-null counts and dtypes of the current frame.
df.info()

In [None]:
# Display the 'self_reference_avg_shares' column.
# NOTE(review): this column is not part of the `cols` selection made earlier
# in this file, so it is only available on the unreduced frame -- confirm.
df['self_reference_avg_shares']

In [None]:
# Mean of the target per (data_channel, weekday) pair; a single grouped mean
# replaces dividing a grouped sum by a grouped count of the same groups.
avg_shares_per_article = df.groupby(['data_channel', 'weekday'])[y_label].mean()

In [None]:
# Divide each (data_channel, weekday) average by the sum of its channel's
# averages, so the values of every channel add up to 1.
norm_avg_shares_per_article = avg_shares_per_article / (
    avg_shares_per_article.groupby(level='data_channel').sum()
)

In [None]:
# Reshape the normalised averages into a table with one row per data channel
# and one column per weekday.
vals = pd.DataFrame(norm_avg_shares_per_article).pivot_table(values=y_label,index='data_channel',columns='weekday')

In [None]:
# Heatmap of the channel-by-weekday table, scaled by 100 so the annotated
# cells read as percentages of each channel's total.
sns.heatmap(vals*100, annot=True)
plt.title('Week-day number of shares')