In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# Load Data

In [None]:
# Dataset identifier; also the stem of the output file names built from it below.
dataname = 'OnlineNewsPopularity'
# Location of the raw CSV, relative to the notebook's working directory.
filepath = 'OnlineNewsPopularity.csv'

In [None]:
# Read the raw table and discard the `url` column in one step, then display
# the resulting frame.
df = pd.read_csv(filepath).drop(columns='url')
df

In [None]:
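# Disabled helper: prints a markdown table row (name/link, #rows, #features, feature type) for this dataset.
# data_link = 'https://archive.ics.uci.edu/dataset/332/online+news+popularity'
# print("|[{}]({})| {} | {} | mixed|".format(dataname, data_link, len(df), len(df.columns[:-1])))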

In [None]:
# Normalise the header by trimming leading/trailing whitespace from every
# column name, then show the cleaned names.
df.columns = df.columns.map(str.strip)
df.columns

In [None]:
# Every column whose name contains the substring "channel".
channel_cols = list(filter(lambda name: 'channel' in name, df.columns))
channel_cols

In [None]:
# Flag rows whose channel columns sum to zero with a new 0/1 column,
# 'channel_is_all_zero'.
#
# Columns derived by this notebook ('channel_is_all_zero' here, 'channel' in a
# later cell) also contain the substring "channel".  They are excluded from the
# source list so that re-running this cell cannot fold an earlier result back
# into the row sums.
derived_channel_cols = ('channel_is_all_zero', 'channel')
source_channel_cols = [
    c for c in df.columns
    if 'channel' in c and c not in derived_channel_cols
]

# 1 when the source channel columns of a row sum to zero, else 0.
df['channel_is_all_zero'] = (df[source_channel_cols].sum(axis=1) == 0).astype('int64')

# Source columns followed by the new flag: the same names, in the same order,
# that a fresh substring scan yields on a top-to-bottom run.
channel_cols = source_channel_cols + ['channel_is_all_zero']
df[channel_cols]

In [None]:
# Collapse the channel columns into one integer code per row: the code is the
# position (within channel_cols) of the column holding the row's largest value.
channel_cols_dict = dict(zip(channel_cols, range(len(channel_cols))))
df['channel'] = (
    df[channel_cols]
    .idxmax(axis=1)                                  # label of the winning column
    .apply(lambda label: channel_cols_dict[label])   # label -> integer code
)
df['channel'].value_counts()

In [None]:
# Collapse the weekday columns into one integer code per row.
#
# The derived 'weekday' column itself matches the substring test, so it is
# excluded explicitly: otherwise re-running this cell would treat the previous
# result as one more indicator column and change both the mapping and the codes.
weekday_cols = [c for c in df.columns if 'weekday' in c and c != 'weekday']
weekday_cols_dict = {c: i for i, c in enumerate(weekday_cols)}
# idxmax yields, per row, the label of the column with the largest value; the
# lookup turns that label into its position within weekday_cols.
df['weekday'] = (
    df[weekday_cols]
    .idxmax(axis=1)
    .apply(lambda label: weekday_cols_dict[label])
)
df['weekday'].value_counts()

In [None]:
# The columns listed in channel_cols and weekday_cols have been summarised by
# the integer-coded 'channel' and 'weekday' columns; remove them and report how
# many columns remain.
df = df.drop(columns=channel_cols).drop(columns=weekday_cols)
df.shape[1]

In [None]:
# Move the target column 'shares' to the last position; the cells that follow
# address the label as df.iloc[:, -1] / df.columns[-1].
df_y = df['shares']
df = df.drop(columns='shares')
# Insert at the current column count (i.e. append) rather than at a hard-coded
# index: a fixed index raises when the frame has fewer columns and leaves the
# label in the middle when it has more.
df.insert(len(df.columns), 'shares', df_y)
df

In [None]:
# Summary statistics of the label, i.e. the last column of df.
df[df.columns[-1]].describe()

In [None]:
# Write the companion "<dataname>.info" file: one "<column> <kind>" line per
# column, followed by a final LABEL_POS line.
info_filepath = dataname + '.info'
info_lines = []
for col in df.columns[:-1]:
    distinct_values = df[col].unique()
    if len(distinct_values) > 10:
        info_lines.append(col + ' continuous\n')
        continue
    # At most 10 distinct values: record the column as discrete and echo the values.
    print(col, distinct_values)
    info_lines.append(col + ' discrete\n')
# The last column (the target) is always recorded as continuous.
info_lines.append(df.columns[-1] + ' continuous\n')
info_lines.append('LABEL_POS -1')
with open(info_filepath, 'w') as f:
    f.writelines(info_lines)

In [None]:
# Export the processed table as a bare CSV: no header row and no index column.
df.to_csv(dataname + '.data', header=False, index=False)

# Data Analysis

In [None]:
# Build a copy of df whose 'shares' column is on a natural-log scale; df itself
# is left untouched.  The final expression shows the largest log value.
new_df = df.assign(shares=np.log(df['shares']))
new_df['shares'].max()

In [None]:
# Density of the log-transformed label (last column of new_df), saved as its
# own figure.
os.makedirs('figs', exist_ok=True)  # savefig does not create missing directories
plt.figure(num=None, figsize=(10, 5), dpi=80, facecolor='w', edgecolor='k')
sns.distplot(new_df.iloc[:, -1], hist=False, rug=False, kde_kws={"shade": True})
# The curve is a kernel density estimate, so the y axis is a density, not a count.
plt.ylabel('density')
# new_df holds the log-transformed label, so say so in the title.
plt.title(f'log {new_df.columns[-1]} distribution')
# Distinct file name: the two-panel figure in the next cell is saved as
# '<dataname>_label_distribution.png', and sharing that name would overwrite
# this one.
plt.savefig('figs/' + dataname + '_log_label_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Side-by-side label distributions: raw scale (df) on the left, log scale
# (new_df) on the right.
os.makedirs('figs', exist_ok=True)  # savefig does not create missing directories
plt.figure(num=None, figsize=(15, 5), dpi=80, facecolor='w', edgecolor='k')
savenames = ['', 'log']
dfs = [df, new_df]
for i, (prefix, frame) in enumerate(zip(savenames, dfs), start=1):
    ax = plt.subplot(1, 2, i)
    sns.distplot(frame.iloc[:, -1], hist=True, rug=False, kde_kws={"shade": True})
    # With a KDE overlaid, distplot normalises the histogram, so the bar
    # heights are densities rather than counts.
    ax.set_ylabel('density')
    # strip() removes the leading space the empty raw-scale prefix would leave.
    ax.set_title(f'{prefix} {df.columns[-1]} distribution'.strip())
plt.savefig('figs/' + dataname + '_label_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Split the feature columns by how strongly they correlate with the
# log-scaled label: |corr| > threshold is kept; everything else (including NaN
# correlations, which fail the comparison) is listed for dropping and printed.
corrs = new_df.corr().values
threshold = 0.01
remain_cols = []
drop_cols = []
# The last matrix column holds each column's correlation with the label, and
# the final row (label vs. itself) is skipped.  new_df has df's column order,
# so df.columns supplies the names -- assuming corr() kept every column, i.e.
# all columns are numeric.
for name, corr_with_label in zip(df.columns, corrs[:-1, -1]):
    if abs(corr_with_label) > threshold:
        remain_cols.append(name)
        continue
    drop_cols.append(name)
    print(name, corr_with_label)
print(len(drop_cols), drop_cols)