# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import SGD

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

# Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print('train shape:',train.shape)
print('test shape:',test.shape)

train shape: (250000, 102)
test shape: (150000, 101)


In [3]:
train.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-0.00235,59,0.766739,-1.35046,42.2727,16.6857,30.3599,1.2673,0.392007,...,-42.4399,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898,15
1,1,0.784462,145,-0.463845,-0.530421,27324.9,3.47545,160.498,0.828007,3.73586,...,-184.132,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383,3
2,2,0.317816,19,-0.432571,-0.382644,1383.26,19.7129,31.1026,-0.515354,34.4308,...,7.43721,37.2181,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055,6
3,3,0.210753,17,-0.616454,0.946362,-119.253,4.08235,185.257,1.38331,-47.5214,...,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288,2
4,4,0.439671,20,0.968126,-0.092546,74.302,12.3065,72.186,-0.233964,24.3991,...,290.657,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197,1


In [4]:
# Train and validation data
X_train, X_valid = np.split(train.sample(frac=1, random_state=42), 
                       [int(.9*len(train))])

y_train = X_train['loss'].values
y_valid = X_valid['loss'].values

X_train = X_train.drop(columns = ['loss','id'])
X_valid = X_valid.drop(columns = ['loss','id'])


# Test data
X_test = test.drop(columns = ['id'])

print('Train set:', X_train.shape)
print('Validation set:', X_valid.shape)
print('Test set:', X_test.shape)

Train set: (225000, 100)
Validation set: (25000, 100)
Test set: (150000, 100)


In [5]:
#preprocess data

features_num = list(X_train.columns[0:99])

preprocessor = make_column_transformer(
    (StandardScaler(), features_num)
)

X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)
X_test = preprocessor.fit_transform(X_test)

y_train = y_train
y_valid = y_valid


In [6]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor() 
_ = forest.fit(X_train, y_train)
forest.score(X_valid, y_valid)

-0.011220303156011191

In [8]:
pd.DataFrame(
    zip(X_train.columns, abs(forest.feature_importances_)), 
    columns=["feature","weight"],).sort_values("weight").reset_index(drop=True)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [9]:
from sklearn.feature_selection import RFE

rfe = RFE(estimator=RandomForestRegressor(), n_feature_to_select=10)

_ =rfe.fit(X_train, y_train)

TypeError: __init__() got an unexpected keyword argument 'n_feature_to_select'

In [10]:
X_train.loc[:, rfe.support_]

AttributeError: 'numpy.ndarray' object has no attribute 'loc'

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = train.drop(['loss'], axis=1)
y = train['loss'].values



#no of features
nof_list=np.arange(1,13)            
high_score=0
#Variable to store the optimum features
nof=0           
score_list =[]
for n in range(len(nof_list)):
    X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size = 0.05, random_state = 0)
    #X_train, X_valid = np.split(train.sample(frac=1, random_state=42),[int(.95*len(train))])
    model = LinearRegression()
    rfe = RFE(model,nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_valid_rfe = rfe.transform(X_valid)
    model.fit(X_train_rfe,y_train)
    score = model.score(X_valid_rfe,y_valid)
    score_list.append(score)
    if(score>high_score):
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))


In [14]:
#feature selection

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=10, step=1, importance_getter='auto')
rfe_selector.fit(X_train, y_train)

# The best set of features that are selcted, denoted as True, False
print(selector.support_)
# All selected features are marked 1, the unselected redundant features ranked in increasing order
print(selector.ranking_)

# Get a mask, or integer index, of the features selected
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

KeyboardInterrupt: 

# Model

In [None]:
input_shape = [X_train.shape[1]]

model = keras.Sequential([
    layers.BatchNormalization(input_shape=input_shape),
    layers.Dense(150, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.4),
    layers.BatchNormalization(),
    layers.Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.4),
    layers.BatchNormalization(),
    layers.Dense(25, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.4),
    layers.BatchNormalization(),
    layers.Dense(1),
])

In [None]:
model.compile(
    optimizer=SGD(lr=0.01),
    loss='mse',
    metrics=[keras.metrics.RootMeanSquaredError()],
)

early_stopping = callbacks.EarlyStopping(
    min_delta=0.001, # minimium amount of change to count as an improvement
    patience=20, # how many epochs to wait before stopping
    restore_best_weights=True,
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=128,
    epochs=50,
    callbacks=[early_stopping],
    verbose=1
)


# Model performance

In [None]:
#RMSE
predicted_train = model.predict(X_train)
predicted_valid = model.predict(X_valid)
rmse_train = metrics.mean_squared_error(y_train, predicted_train, squared=False)
rmse_valid = metrics.mean_squared_error(y_valid, predicted_valid, squared=False)
print('Training RMSE: ', rmse_train)
print('Validation RMSE: ', rmse_valid)

In [None]:
#Loss curves
plt.style.use('seaborn-whitegrid')
# Set Matplotlib defaults
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)
plt.rc('animation', html='html5')

history_df = pd.DataFrame(history.history)
history_df.loc[0:, ['root_mean_squared_error', 'val_root_mean_squared_error']].plot()
print(("Minimum Root Mean Squared Error: {:0.4f}").format(history_df['root_mean_squared_error'].min()))
print(("Minimum Validation Root Mean Squared Error: {:0.4f}").format(history_df['val_root_mean_squared_error'].min()))

# Prediction

In [None]:
y_pred = model.predict(X_test)

# Submission

In [None]:
preds = pd.read_csv("sample_submission.csv")
preds.loss = y_pred
preds.head()

In [None]:
preds.to_csv('submission14.csv', index=False)