#### Load Libraries

In [1]:
import pandas as pd
import numpy as np


#### Define variables

In [2]:
# Date-like columns of the data set (not referenced again in the visible cells).
l_parse_date_cols = ['dt_prediction_date', 'dt_target_date', 'dt_flight_date']

# Target columns: sixteen 15-minute buckets (000_014 ... 225_239) counted in
# minutes before SDT, followed by one open-ended "240plus" bucket.
l_target_cols = list(
    'num_pax_%03d_%03d_mins_before_sdt' % (bucket_start, bucket_start + 14)
    for bucket_start in range(0, 240, 15)
) + ['num_pax_240plus_mins_before_sdt']


#### Function to calculate score

In [3]:
from sklearn.metrics import mean_squared_error
def calculate_score(df_target_cases, df_predictions):
    """Return the root-mean-squared error (the chosen error metric).

    The score is the square root of ``mean_squared_error`` evaluated on
    the true target values and the predicted values.

    Parameters:
        df_target_cases: true target values.
        df_predictions: predicted values to compare against the targets.

    Returns:
        The root-mean-squared error.
    """
    f_mse = mean_squared_error(df_target_cases, df_predictions)
    return np.sqrt(f_mse)

#### Read in csv file and set index as id

In [4]:
# Load both CSV files and use the 'id' column as the row index.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
train = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\data_20161116\train.csv").set_index('id')
test = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\data_20161116\test.csv").set_index('id')

#### Keep only the test rows that contain NaNs (the rows selected for prediction)

In [5]:
# Boolean row mask: True where a test row has at least one missing value.
# Only those rows are kept in test2.
test2 = test.loc[test.isnull().any(axis=1)]

#### Set target columns

In [6]:
# Regression targets: every column listed in l_target_cols, for all training rows.
target = train.loc[:, l_target_cols]

#### Drop the target columns from the training and test datasets

In [7]:
# Feature frames: the same rows as train / test2 with the target columns removed.
df1 = train.drop(labels=l_target_cols, axis=1)
test2 = test2.drop(labels=l_target_cols, axis=1)

#### Concatenate the 2 data sets

In [8]:
# Stack the training features on top of the test features so both receive the
# same encoding and normalisation; the first len(df1) rows are the training rows.
# pd.concat replaces DataFrame.append, which was deprecated and later removed
# from pandas; ignore_index=True renumbers the rows 0..n-1 as before.
df = pd.concat([df1, test2], ignore_index=True)

#### Categorical variables as integers

In [9]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical / date column with integer codes.
# A single LabelEncoder instance is refit on every column in turn, so after the
# loop it only holds the classes of the last column ('s_model_type').
class_le = LabelEncoder()
for encoded_col in ('cat_case_type',
                    'cat_s_plane_capacity',
                    'dt_flight_date',
                    'dt_prediction_date',
                    'dt_target_date',
                    's_model_type'):
    df[encoded_col] = class_le.fit_transform(df[encoded_col].values)

#### Normalize

In [10]:
from sklearn import preprocessing
# Scale every row (sample) to unit L2 norm; norm='l2' and axis=1 are the
# defaults of preprocessing.normalize, written out here for clarity.
# The result is a NumPy array, not a DataFrame.
normalized_df = preprocessing.normalize(df, norm='l2', axis=1)

#### Split the train and test data sets

In [11]:
# The training rows come first in the combined matrix, followed by the test rows.
n_train_rows = train.shape[0]
X, test1 = normalized_df[:n_train_rows], normalized_df[n_train_rows:]

In [56]:
# Sanity check: X and target must have the same number of rows.
for n_rows, label in ((len(X), " rows for training set"),
                      (len(test1), " rows for test set"),
                      (len(target), " rows for target")):
    print(str(n_rows) + label)

98043 rows for training set
4577 rows for test set
98043 rows for target


#### Split the data set for checking the score

In [57]:
# Positional hold-out split: the first 90% of rows train the model and the
# remaining rows are used to check the score (no shuffling is applied).
offset = int(0.9 * X.shape[0])
X_train, X_test = X[:offset], X[offset:]
y_train, y_test = target[:offset], target[offset:]

In [58]:
# Confirm that the training features and the training targets have the same
# number of rows. The second line reports len(y_train), so it is labelled as
# the training target rather than as the test set.
print (str(len(X_train))+" rows for training set")
print (str(len(y_train))+" rows for training target")

88238 rows for training set
88238 rows for test set


#### Fit regression model

In [59]:
# Hold-out evaluation: fit on X_train / y_train and measure the MSE on X_test.
# Alternatives tried earlier (kept here as a note only): LinearRegression,
# Ridge(alpha=0.003) and KNeighborsRegressor(n_neighbors=300, n_jobs=-1).
from sklearn.linear_model import ElasticNet
model = ElasticNet(alpha=.001,random_state=5)

model.fit(X_train, y_train)
# Predict with the estimator fitted above. It is bound to `model`; no `clf`
# object is defined in the visible notebook, so `clf.predict` raised NameError.
prediction_test = model.predict(X_test)
mse = mean_squared_error(y_test, prediction_test )
print("MSE: %.4f" % mse)

MSE: 59.8135


In [60]:
###############################################
# 6. Pass the predictions through our error function to get the model score
###############################################
# The fitted estimator is bound to `model`; no `clf` object is defined in the
# visible notebook, so the hold-out predictions are taken from `model`.
f_rmse = calculate_score(y_test, model.predict(X_test))
print('The root-mean-squared error is ' + str(f_rmse))

The root-mean-squared error is 7.73391636662


#### Predict the test data set with Elastic Net

In [13]:
from sklearn.linear_model import ElasticNet
# Refit on the complete training matrix (no hold-out) and predict the test rows.
# fit() returns the estimator itself, so construction and fitting are chained.
model = ElasticNet(alpha=.001, random_state=5).fit(X, target)
# NOTE(review): astype(int) truncates toward zero rather than rounding.
predictions_1 = model.predict(test1).astype(int)

#### Predictions array

In [16]:
# Show the integer prediction matrix (NumPy abbreviates large arrays with "...").
predictions_1

array([[0, 0, 2, ..., 0, 0, 1],
       [0, 0, 2, ..., 0, 0, 1],
       [0, 0, 2, ..., 0, 0, 1],
       ..., 
       [0, 0, 1, ..., 1, 0, 1],
       [0, 0, 1, ..., 1, 0, 1],
       [0, 0, 1, ..., 1, 0, 1]])

#### Reshape the predictions to a dataframe

In [17]:
# Shape the predictions as (n_rows, n_targets): one column per entry of
# l_target_cols. Reshaping to (-1, 1) produced a single column, which made the
# df4[:, 1] ... df4[:, 16] lookups in the next cell fail with IndexError.
df4 = predictions_1.reshape((-1, len(l_target_cols)))

#### Assign the value to each column

In [129]:
# Build one named column per target: the i-th column of df4 is stored under the
# i-th name of l_target_cols (the same 17 names, in the same order).
df5 = pd.DataFrame({
    target_col: df4[:, col_pos]
    for col_pos, target_col in enumerate(l_target_cols)
})

In [130]:
# Single-column frame holding the ids of the test rows that were predicted.
submission = pd.DataFrame({"id": test2.index})

In [131]:
# Place the id column beside the prediction columns; rows are aligned on the
# row index of the two frames.
submission_parts = [submission, df5]
df6 = pd.concat(submission_parts, axis=1)

In [132]:
# Write the submission file to the working directory; index=False leaves the
# row index out of the CSV so only 'id' and the prediction columns are written.
df6.to_csv(path_or_buf="attempt_PiushVaish_Normalized_ElasticNet.csv", index=False)