In [1]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the raw dataset from a CSV file in the working directory; the cells
# below select their feature, target and date columns from this frame by name.
data = pd.read_csv('raw_data.csv')

In [2]:
# Predictors: one column per search term that feeds the regression.
X = data[[
    'harrypotterwiki',
    'hogwartsishere',
    'jkrowling',
    'harrypotterlexicon',
    'mugglenet',
    'wizardingworld',
]]
# Target: double brackets keep it as a one-column DataFrame rather than a Series.
y = data[['pottermore']]

# NOTE(review): test_size=0.8 holds out 80% of the rows and fits on only 20%.
# Confirm this is intended and not a swapped train/test fraction.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=100,
)

In [3]:
# Fit a linear regression on the training partition only; fit() returns the
# estimator itself, so reg_model is the fitted model.
reg_model = LinearRegression().fit(X=X_train, y=y_train)

In [4]:
# Predict on both partitions. Only the held-out predictions (y_pred) are
# scored below; the training-set predictions are kept in x_pred.
y_pred = reg_model.predict(X_test)
x_pred = reg_model.predict(X_train)

# Error metrics of the held-out predictions against the held-out targets.
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
mape = metrics.mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

# NOTE(review): scikit-learn reports MAPE as a fraction (1.0 == 100%), not a
# value scaled to 0-100 — read the printed number accordingly.
for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Absolute Percentage Error:', mape),
    ('Mean Square Error:', mse),
):
    print(label, value)

Mean Absolute Error: 17.183195604730418
Mean Absolute Percentage Error: 1.1785504109182832
Mean Square Error: 425.3462122278077


In [5]:
# Report the fitted parameters. The model was fitted against a one-column
# DataFrame target, so the per-feature weights are read from row 0 of coef_.
print('Intercept: ',reg_model.intercept_)
coef = {}
for name, weight in zip(X_train.columns.tolist(), reg_model.coef_[0]):
    print(f'{name}: {weight}')
    coef[name] = weight

# Build the coefficient-weighted sum of the predictor columns, accumulating
# in the same column order as the fit. The intercept is NOT added, so this
# column is a weighted index of the inputs, not the model's prediction.
weighted = None
for name, weight in coef.items():
    term = data[name] * weight
    weighted = term if weighted is None else weighted + term
data['weighted_searches'] = weighted

Intercept:  [17.54886013]
harrypotterwiki: -0.1732519835666518
hogwartsishere: -0.026006443515529748
jkrowling: 0.029119305118605967
harrypotterlexicon: 0.10688512945126766
mugglenet: -0.05485459018477699
wizardingworld: 0.35897980502634536


In [10]:
# Assemble the export frame in one chain: keep the target and the weighted
# index (renamed to 'searches'), then key the rows by date.
new_data = (
    data[['pottermore', 'weighted_searches', 'date']]
    .rename(columns={'weighted_searches': 'searches'})
    .set_index(keys='date')
)
new_data.head()

Unnamed: 0_level_0,pottermore,searches
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-01,10,5.743677
2016-01-02,11,11.287644
2016-01-03,10,9.034139
2016-01-04,8,2.337842
2016-01-05,8,1.214174


In [12]:
# Write the date-indexed frame to data.csv in the working directory; with
# pandas' default settings the index is written as the first column.
new_data.to_csv('data.csv')