In [None]:
# The dataset files live in the "../input/" directory.
# Running this cell (Run button or Shift+Enter) prints the names of the files found there.

import os

input_dir = "../input"
print(os.listdir(input_dir))

# Anything written to the current directory is saved as notebook output.

Import necessary libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
# Default figure size for every plot below; change it here if needed.
sns.set(rc={"figure.figsize": (10, 8)})

Load the data. They are located in the input directory.


In [None]:
# Read the reduced Online News Popularity CSV from the input directory.
DATA_PATH = '../input/OnlineNewsPopularityReduced.csv'
df = pd.read_csv(DATA_PATH)

# Summary statistics, shown as the cell output.
df.describe()

Some articles contain 0 words; they should be removed.

In [None]:
# Flag the articles whose content has no tokens at all and report how many there are.
is_empty = df['n_tokens_content'] == 0
print(is_empty.sum(), 'articles with 0 words')

# Peek at the first few of them.
df[is_empty].head()

In [None]:
# Keep only the articles whose content has at least one token.
has_text = df['n_tokens_content'] != 0
df = df[has_text]

In [None]:
# Strip plot of the share counts, used to spot extreme values.
# `shares` is passed explicitly as `x`: seaborn 0.12+ treats the first positional
# argument as `data` rather than `x`, so the keyword is what guarantees the values
# are laid out along the horizontal axis on every seaborn version.
sns.stripplot(x=df['shares'])

It is better to simply delete the single outlier on the right.

In [None]:
# Show the row(s) whose share count equals the extreme value 284700.
df.loc[df['shares'] == 284700]

In [None]:
# Remove the outlier by its share count instead of by the hard-coded row label 584.
# A value-based filter is idempotent: once `reset_index` has run, label 584 belongs to
# a different article, so re-running `df.drop(584)` would silently delete valid data.
# NOTE(review): assumes the 284700-share article is the row labelled 584 and is unique — confirm.
df = df[df['shares'] != 284700].reset_index(drop=True)

In [None]:
# One row with each column's dtype and one row with its number of distinct values.
column_types = df.dtypes
unique_counts = df.nunique()
pd.concat([column_types, unique_counts], axis=1).T

In [None]:
# Show the names of all columns currently in `df`.
df.columns

Adding some useful features.

In [None]:
# Indicator columns for the six named channels and for the seven weekdays.
channel_flags = [
    'data_channel_is_lifestyle',
    'data_channel_is_entertainment',
    'data_channel_is_bus',
    'data_channel_is_socmed',
    'data_channel_is_tech',
    'data_channel_is_world',
]
weekday_flags = [
    'weekday_is_monday',
    'weekday_is_tuesday',
    'weekday_is_wednesday',
    'weekday_is_thursday',
    'weekday_is_friday',
    'weekday_is_saturday',
    'weekday_is_sunday',
]

# One minus the row-wise sum of the six channel flags
# (1 when no flag is set, assuming the flags are 0/1 — TODO confirm).
df['data_channel_is_none'] = 1 - df[channel_flags].sum(axis=1)

# Collapse each indicator group into one label column: take the name of the column
# holding the row maximum, strip the common prefix, and capitalize what is left.
channel_winner = df[channel_flags + ['data_channel_is_none']].idxmax(axis=1)
df['data_channel'] = channel_winner.str[len('data_channel_is_'):].str.capitalize()

weekday_winner = df[weekday_flags].idxmax(axis=1)
df['weekday'] = weekday_winner.str[len('weekday_is_'):].str.capitalize()
# Capitalized channel labels, in the order of the channel indicator columns.
channels = [
    'Lifestyle',
    'Entertainment',
    'Bus',
    'Socmed',
    'Tech',
    'World',
    'None',
]

# Weekday labels in calendar order; used to order the per-day aggregates.
days = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

The **kw_min_min** column has only one unique value, so it can be dropped.

In [None]:
# Drop the `kw_min_min` column.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'` keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='kw_min_min', errors='ignore')

#### Read the description of the dataset, define the target variable and the main task of the forecasting model.

The target column is **shares**, which represents the number of shares an article received.
Therefore, the main task of the forecasting model is to predict how popular a news article will be, measured by its number of shares.

#### On what day of the week were most of the articles in the dataset published? On what day were the fewest articles published? Visualize the information about the days of the week.

In [None]:
# Number of articles per weekday, ordered Monday to Sunday via `days`.
articles_per_day = df.groupby('weekday')['weekday'].count().loc[days]

ax = articles_per_day.plot.bar()
ax.set_xlabel('Day of week')
ax.set_ylabel('Count')

We can see that many articles are published on weekdays (with Tuesday ranking first) and 2–3 times fewer on weekends. The fewest articles are published on Saturday.

#### What can you say about the distribution of values of the **n_tokens_title** variable? How is the length of the article title related to the result variable?

In [None]:
# Mean and variance of the title length, followed by its histogram.
title_len = df['n_tokens_title']
print(title_len.mean(), title_len.var())
title_len.hist(bins=20)

The distribution of **n_tokens_title** is approximately normal, with mean = 11.2 and variance = 4.8.

In [None]:
# Rank correlation between the share count and the title length, plus a scatter plot.
rho, p_value = spearmanr(df['shares'], df['n_tokens_title'])
print('Spearman correlation:', rho, 'p-value:', p_value)
df.plot.scatter(x='n_tokens_title', y='shares')

In [None]:
# How many articles have a title shorter than 7 or longer than 16 tokens ...
title_len = df['n_tokens_title']
outside_range = (title_len < 7) | (title_len > 16)
print(int(outside_range.sum()))

# ... and how many articles exceed 10000 shares.
above_10k = df['shares'] > 10000
print(int(above_10k.sum()))

Only titles that contain 7–16 words get more than ~10000 shares, but this may simply be due to the small amount of data for other title lengths. The Spearman correlation is also very low.

#### Which factor has a greater impact on the popularity of an article --- the number of pictures or videos?

In [None]:
# Spearman correlation and p-value matrices over [shares, num_imgs, num_videos].
# Row 0 is `shares`; columns 1 and 2 are `num_imgs` and `num_videos` respectively.
media_cols = ['num_imgs', 'num_videos']
rho, p_value = spearmanr(df['shares'], df[media_cols])
print('Spearman correlation with num_imgs:', rho[0, 1], 'p-value:', p_value[0, 1])
print('Spearman correlation with num_videos:', rho[0, 2], 'p-value:', p_value[0, 2])

# Side-by-side scatter plots sharing the `shares` axis.
fig, (ax_imgs, ax_videos) = plt.subplots(nrows=1, ncols=2, figsize=(14, 7), sharey=True)
df.plot.scatter(x='num_imgs', y='shares', ax=ax_imgs)
df.plot.scatter(x='num_videos', y='shares', ax=ax_videos)

There is a low yet statistically significant correlation with **num_videos** and a decent correlation with **num_imgs**. The most popular articles (>10000 shares) contain only a small number of videos and images.

#### Is it true that articles published on weekends are on average more popular than articles published on weekdays? Come up with a visualization.

In [None]:
# Mean share count per weekday, ordered Monday to Sunday via `days`.
mean_per_weekday = df.groupby('weekday')['shares'].mean()
shares_by_day = mean_per_weekday.loc[days]
print(shares_by_day)

ax = shares_by_day.plot.bar()
ax.set_ylabel('Mean shares')
ax.set_xlabel('Day of week')

Articles are indeed more popular when published on weekends, especially on Sunday.

#### Is there a relationship between the length of the text (number of words) and the popularity of the article?

In [None]:
# Rank correlation between the share count and the article length, plus a scatter plot.
rho, p_value = spearmanr(df['shares'], df['n_tokens_content'])
print('Spearman correlation:', rho, 'p-value:', p_value)
df.plot.scatter(x='n_tokens_content', y='shares')

There is a small negative correlation between them.

#### What other insights can be found in the available data? Creative task.

How does the data channel affect popularity?

In [None]:
# Article count and mean share count for every data channel.
channel_stats = df.groupby('data_channel')['shares'].agg(['count', 'mean'])
print(channel_stats)

# Left panel: number of articles per channel; right panel: mean shares per channel.
fig, (ax_count, ax_mean) = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))
channel_stats['count'].plot.bar(ax=ax_count)
channel_stats['mean'].plot.bar(ax=ax_mean)
ax_count.set_ylabel('Count')
ax_mean.set_ylabel('Mean shares')

The most frequent data channel, World, also has the lowest mean number of shares.

In [None]:
# Drop the two derived label columns and the article URL before computing correlations.
excluded_cols = ['weekday', 'data_channel', 'url']
data = df.drop(columns=excluded_cols)

Let's calculate correlations between **shares** and every other column

In [None]:
# Spearman correlation (`c`) and p-value (`p`) between `shares` and every column of
# `data`, `shares` itself included. The rows are collected first and the frame is built
# once, so `c` and `p` are plain float columns instead of being grown cell-by-cell
# through `.loc` enlargement inside the loop.
corr_rows = [tuple(spearmanr(data['shares'], data[col])) for col in data.columns]
corrs = pd.DataFrame(corr_rows, index=data.columns, columns=['c', 'p'])

# Rank the features by the absolute strength of their correlation, strongest first.
corrs['absc'] = corrs['c'].abs()
corrs = corrs.sort_values('absc', ascending=False)

In [None]:
# The 25 columns with the strongest absolute correlation with `shares`.
corrs.iloc[:25]

All of the strongest correlations are also statistically significant, so these parameters will be most important to look at.

Some interesting columns:
- **global_subjectivity:** the more subjective an article is, the more popular it tends to be
- **is_weekend:** articles tend to be more popular when posted on weekends
- **rate_positive_words, global_rate_positive_words:** articles that use more positive words tend to be more popular

In [None]:
# Correlation, p-value and absolute correlation for the negative-words rate.
corrs.loc['global_rate_negative_words']

Conversely, using more negative words correlates slightly with lower popularity.

In [None]:
# Pairwise plots for the seven top-ranked rows of `corrs`.
top_features = corrs.index[:7]
sns.pairplot(data[top_features])