In [1]:
#Load Required Modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn

import time
import copy

In [2]:
#Load Metric Modules
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [3]:
#Load Data and Test Files
data = pd.read_csv('data.csv', index_col=0)
print('Data Shape: {}'.format(data.shape))
test = pd.read_csv('test.csv', index_col=0)
print('Test Shape: {}'.format(test.shape))

Data Shape: (616656, 19)
Test Shape: (154165, 18)


In [4]:
# PreProcess: add Market Share_total to Test Dataset
test['Market Share_total'] = 0

In [5]:
# PreProcess: Remove NaN Rows
#data = data.dropna()
#data = data.reset_index(drop=True)
#test = test.dropna()
#test = test.reset_index(drop=True)

In [6]:
# Put Data together for covert them into Categorical & Normalization
frames = [data, test]
datafrs = pd.concat(frames, keys=['data','test'], sort=False)

In [7]:
#datafrs = datafrs.loc['data'].reset_index(drop=True)
datafrs = datafrs.drop(columns=['Episode','Start_time','End_time', '# of episode in the season', 'Name of episode', 'Temperature in Montreal during episode', 'Year'])

In [8]:
#datafrs['Episode'] = pd.factorize(datafrs['Episode'])[0]
datafrs['Station'] = pd.factorize(datafrs['Station'])[0]
datafrs['Channel Type'] = pd.factorize(datafrs['Channel Type'])[0]
datafrs['Season'] = pd.factorize(datafrs['Season'])[0]
#datafrs['Year'] = pd.factorize(datafrs['Year'])[0]
datafrs['Date'] = pd.factorize(datafrs['Date'])[0]
datafrs['Day of week'] = pd.factorize(datafrs['Day of week'])[0]
datafrs['Name of show'] = pd.factorize(datafrs['Name of show'])[0]
datafrs['Genre'] = pd.factorize(datafrs['Genre'])[0]
datafrs['First time or rerun'] = pd.factorize(datafrs['First time or rerun'])[0]
datafrs['Movie?'] = pd.factorize(datafrs['Movie?'])[0]
datafrs['Game of the Canadiens during episode?'] = pd.factorize(datafrs['Game of the Canadiens during episode?'])[0]
#data['Start_time'] = pd.factorize(data['Start_time'])[0]
#data['End_time'] = pd.factorize(data['End_time'])[0]
#data['Name of episode'] = pd.factorize(data['Name of episode'])[0]

In [9]:
cols = datafrs.columns.tolist()
cols = cols[:-2] + cols[-1:] + cols[-2:-1]
datafrs = datafrs[cols]
datafrs

Unnamed: 0,Unnamed: 1,Station,Channel Type,Season,Date,Day of week,Length,Name of show,Genre,First time or rerun,Movie?,Market Share_total,Game of the Canadiens during episode?
data,1,0,0,0,0,0,8,0,0,0,0,0.9,0
data,2,0,0,0,0,0,2,1,1,0,0,0.5,0
data,3,0,0,0,0,0,2,2,2,0,0,0.3,0
data,4,0,0,0,0,0,4,3,3,0,0,1.7,0
data,5,0,0,0,0,0,2,4,4,0,0,2.2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
test,154161,21,1,3,1091,6,4,5839,15,0,0,0.0,0
test,154162,21,1,3,1091,6,4,569,13,0,0,0.0,0
test,154163,21,1,3,1091,6,6,1850,16,0,1,0.0,0
test,154164,21,1,3,1091,6,10,7057,16,0,1,0.0,0


In [10]:
def norm(x):
    return (x - x.mean()) / x.std()

In [11]:
datafrs_norm = datafrs[datafrs.columns[:-1]]
datafrs_norm = datafrs_norm.apply(norm)

datafrs_norm['Market Share_total'] = datafrs['Market Share_total']

In [12]:
datafrs_norm

Unnamed: 0,Unnamed: 1,Station,Channel Type,Season,Date,Day of week,Length,Name of show,Genre,First time or rerun,Movie?,Market Share_total
data,1,-1.696978,-2.552046,-1.131690,-1.748152,-1.497928,0.906181,-0.940207,-1.401938,-0.16533,-0.102460,0.9
data,2,-1.696978,-2.552046,-1.131690,-1.748152,-1.497928,-0.223753,-0.939687,-1.257463,-0.16533,-0.102460,0.5
data,3,-1.696978,-2.552046,-1.131690,-1.748152,-1.497928,-0.223753,-0.939168,-1.112987,-0.16533,-0.102460,0.3
data,4,-1.696978,-2.552046,-1.131690,-1.748152,-1.497928,0.152892,-0.938649,-0.968512,-0.16533,-0.102460,1.7
data,5,-1.696978,-2.552046,-1.131690,-1.748152,-1.497928,-0.223753,-0.938130,-0.824037,-0.16533,-0.102460,2.2
...,...,...,...,...,...,...,...,...,...,...,...,...
test,154161,1.518830,0.391842,1.482757,1.724433,1.517127,0.152892,2.091796,0.765191,-0.16533,-0.102460,0.0
test,154162,1.518830,0.391842,1.482757,1.724433,1.517127,0.152892,-0.644743,0.476240,-0.16533,-0.102460,0.0
test,154163,1.518830,0.391842,1.482757,1.724433,1.517127,0.529536,0.020438,0.909666,-0.16533,9.759931,0.0
test,154164,1.518830,0.391842,1.482757,1.724433,1.517127,1.282825,2.724264,0.909666,-0.16533,9.759931,0.0


In [13]:
datafrs

Unnamed: 0,Unnamed: 1,Station,Channel Type,Season,Date,Day of week,Length,Name of show,Genre,First time or rerun,Movie?,Market Share_total,Game of the Canadiens during episode?
data,1,0,0,0,0,0,8,0,0,0,0,0.9,0
data,2,0,0,0,0,0,2,1,1,0,0,0.5,0
data,3,0,0,0,0,0,2,2,2,0,0,0.3,0
data,4,0,0,0,0,0,4,3,3,0,0,1.7,0
data,5,0,0,0,0,0,2,4,4,0,0,2.2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
test,154161,21,1,3,1091,6,4,5839,15,0,0,0.0,0
test,154162,21,1,3,1091,6,4,569,13,0,0,0.0,0
test,154163,21,1,3,1091,6,6,1850,16,0,1,0.0,0
test,154164,21,1,3,1091,6,10,7057,16,0,1,0.0,0


In [14]:
df_train = datafrs_norm.loc['data']
X = df_train.drop('Market Share_total', axis=1).to_numpy()
Y = df_train['Market Share_total'].to_numpy()

In [15]:
#Split Data into 3 Sections: Train, Validation and Test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=56)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=65)
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_val.shape)
print(Y_test.shape)
n_train = X_train.shape[0]
X = torch.tensor(X, dtype=torch.float)
X_train = torch.tensor(X_train, dtype=torch.float)
X_val = torch.tensor(X_val, dtype=torch.float)
X_test = torch.tensor(X_test, dtype=torch.float)
Y = torch.tensor(Y, dtype=torch.float).view(-1, 1)
Y_train = torch.tensor(Y_train, dtype=torch.float).view(-1, 1)
Y_val = torch.tensor(Y_val, dtype=torch.float).view(-1, 1)
Y_test = torch.tensor(Y_test, dtype=torch.float).view(-1, 1)

(499491, 10)
(55499, 10)
(61666, 10)
(499491,)
(55499,)
(61666,)


In [16]:
def train_model(model, dataloaders, optimizer, loss_func, num_epochs=10):
    since = time.time()
    val_loss_history = []
    best_model_wts = copy.deepcopy(net.state_dict())
    best_loss = np.inf
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in ['train', 'val']:
            running_loss = 0.0
            if phase == 'train':
                model.train()
            else:
                model.eval()
            for inputs, labels in dataloaders[phase]:
                try:
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                except:
                    pass
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = loss_func(outputs, labels)
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
                running_loss = loss.item()
                #running_corrects += torch.sum(outputs == labels.data)
            epoch_loss = running_loss
            #epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            #print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            print('{} Loss: {:.4f}'.format(phase, epoch_loss))
            if phase == 'val' and epoch_loss < best_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_loss_history.append(epoch_loss)
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_loss))
    model.load_state_dict(best_model_wts)
    return model, val_loss_history

In [17]:
#Neural Network To Train - Use Simple Network due to linear regression
w_num = X_train.shape[1]
net = torch.nn.Sequential(
    torch.nn.Linear(w_num, w_num * 2),
    torch.nn.Linear(w_num * 2, w_num * 3),
    torch.nn.Linear(w_num * 3, w_num * 4),
    nn.ReLU(inplace=True),
    torch.nn.Dropout(),
    torch.nn.Linear(w_num * 4, w_num * 3),
    torch.nn.Linear(w_num * 3, w_num * 2),
    torch.nn.Linear(w_num * 2, w_num),
    torch.nn.Linear(w_num, 1)
)

for nl in net:
    if nl._get_name() == 'Linear':
        torch.nn.init.normal_(nl.weight, mean=0, std=0.1)
        torch.nn.init.constant_(nl.bias, val=0)

In [18]:
#Numpy Array to Tensor
dataset_tr = torch.utils.data.TensorDataset(X, Y)
dataset_val = torch.utils.data.TensorDataset(X_val, Y_val)
dataset_test = torch.utils.data.TensorDataset(X_test, Y_test)
dataloaders_dict = {'train' : torch.utils.data.DataLoader(dataset_tr, batch_size=10, shuffle=True)}
dataloaders_dict['val'] = torch.utils.data.DataLoader(dataset_val, batch_size=10, shuffle=True)

#Set Loss Function and optimizer
loss = torch.nn.L1Loss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [19]:
#If running on CPU comment these two lines below
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#net = net.to(device)
model_tr, hist = train_model(net, dataloaders_dict, optimizer, loss, num_epochs=10)

Epoch 0/9
----------
train Loss: 1.1488
val Loss: 4.8083

Epoch 1/9
----------
train Loss: 1.9275
val Loss: 2.7153

Epoch 2/9
----------
train Loss: 2.8278
val Loss: 2.3009

Epoch 3/9
----------
train Loss: 0.5627
val Loss: 0.5051

Epoch 4/9
----------
train Loss: 0.9811
val Loss: 2.3748

Epoch 5/9
----------
train Loss: 1.9710
val Loss: 1.0170

Epoch 6/9
----------
train Loss: 0.9662
val Loss: 1.2142

Epoch 7/9
----------
train Loss: 0.9074
val Loss: 1.8685

Epoch 8/9
----------
train Loss: 1.8538
val Loss: 2.1739

Epoch 9/9
----------
train Loss: 0.8427
val Loss: 1.9121

Training complete in 7m 57s
Best val Acc: 0.505139


In [20]:
#print('MAE Error: {:4f}'.format(loss(model_tr(X_test), Y_test).item()))
try:
    X_test = X_test.to(device)
    print('MAE: {:4f}'.format(Y_test.numpy(), mean_absolute_error(model_tr(X_test).cpu().detach().numpy())))
    print('R-Squared Error: {:4f}'.format(Y_test.numpy(), r2_score(model_tr(X_test).cpu().detach().numpy())))
except: 
    print('MAE: {:4f}'.format(mean_absolute_error(Y_test.numpy(), model_tr(X_test).detach().numpy())))
    print('R-Squared Error: {:4f}'.format(r2_score(Y_test.numpy(), model_tr(X_test).detach().numpy())))    

MAE: 1.845284
R-Squared Error: 0.546092


In [21]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures as pf

# Create linear regression object
regr = linear_model.LinearRegression()

regr_pf = linear_model.LinearRegression()

X_ = pf(degree=3, include_bias=True).fit_transform(X)

# Train the model using the training sets
regr.fit(X, Y)
regr_pf.fit(X_, Y)

# Make predictions using the testing set
y_pred = regr.predict(X_test)
y_pred_pf = regr_pf.predict(pf(degree=3, include_bias=True).fit_transform(X_test))

# The mean squared error
print('Mean absolute error: %.2f' % mean_absolute_error(Y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(Y_test, y_pred))


# The mean squared error
print('Mean absolute error: %.2f' % mean_absolute_error(Y_test, y_pred_pf))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(Y_test, y_pred_pf))

Mean absolute error: 2.33
Coefficient of determination: 0.34
Mean absolute error: 1.88
Coefficient of determination: 0.61


In [22]:
#Prediction For Test dataset
df_test = datafrs_norm.loc['test']
df_test = df_test.drop('Market Share_total', axis=1).to_numpy()
df_test = torch.tensor(df_test, dtype=torch.float)
try:
    df_test = df_test.to(device)
except:
    pass
pred = model_tr(df_test)

In [23]:
try:
    pred = pred.cpu().detach().numpy()
except:
    pred = pred.detach().numpy()

In [24]:
test = pd.read_csv('test.csv', index_col=0)
test['Market Share_total'] = pred

In [25]:
test.to_csv('test_pred.csv')