In [28]:
import pandas as pd
import optuna
from optuna.samplers import TPESampler
import matplotlib
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.express as px

We have 54 columns (53 features named 0–52 plus the target), all holding int or float values. Let's look at the data.

In [47]:
# Read the training set, then print its per-column dtype / non-null summary.
data = pd.read_csv(filepath_or_buffer='internship_train.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 54 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       90000 non-null  int64  
 1   1       90000 non-null  int64  
 2   2       90000 non-null  int64  
 3   3       90000 non-null  int64  
 4   4       90000 non-null  int64  
 5   5       90000 non-null  int64  
 6   6       90000 non-null  float64
 7   7       90000 non-null  float64
 8   8       90000 non-null  int64  
 9   9       90000 non-null  int64  
 10  10      90000 non-null  int64  
 11  11      90000 non-null  int64  
 12  12      90000 non-null  int64  
 13  13      90000 non-null  float64
 14  14      90000 non-null  float64
 15  15      90000 non-null  float64
 16  16      90000 non-null  float64
 17  17      90000 non-null  float64
 18  18      90000 non-null  float64
 19  19      90000 non-null  float64
 20  20      90000 non-null  float64
 21  21      90000 non-null  float64
 22

In [48]:
# Preview the first five rows of the training frame.
data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,target
0,236,488,16,221,382,97,-4.472136,0.107472,0,132,...,13.340874,0.870542,1.962937,7.466666,11.547794,8.822916,9.046424,7.895535,11.010677,20.107472
1,386,206,357,232,1,198,7.81025,0.763713,1,143,...,12.484882,7.16868,2.885415,12.413973,10.260494,10.091351,9.270888,3.173994,13.921871,61.763713
2,429,49,481,111,111,146,8.602325,0.651162,1,430,...,14.030257,0.39497,8.160625,12.592059,8.937577,2.265191,11.255721,12.794841,12.080951,74.651162
3,414,350,481,370,208,158,8.306624,0.424645,1,340,...,2.789577,6.416708,10.549814,11.456437,6.468099,2.519049,0.258284,9.317696,5.383098,69.424645
4,318,359,20,218,317,301,8.124038,0.767304,1,212,...,1.88656,1.919999,2.268203,0.149421,4.105907,10.416291,6.816217,8.58696,4.512419,66.767304


Here we can see that several columns have very similar statistics. There is also just one column with negative values (6) and one categorical column (8).

In [46]:
# Keep only the four summary statistics of interest, then put one column of the data on each row.
(
    data
    .describe()
    .loc[['mean', 'std', 'min', 'max']]
    .T
)

Unnamed: 0,mean,std,min,max
0,249.423944,144.336393,0.0,499.0
1,250.236267,144.0222,0.0,499.0
2,248.637289,144.107577,0.0,499.0
3,249.7366,144.284945,0.0,499.0
4,249.436178,143.941581,0.0,499.0
5,249.656167,144.329168,0.0,499.0
6,-0.011402,7.038171,-9.949874,9.949874
7,0.498548,0.288682,1.4e-05,0.999987
8,0.499189,0.500002,0.0,1.0
9,249.842033,144.612718,0.0,499.0


Let's build a simple model and look at its score and its feature importances.

In [49]:
# Reload the training data.
data = pd.read_csv('internship_train.csv')

# Separate the regression target from the feature columns.
y = data['target']
X = data.drop(columns=['target'])

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [50]:
# Baseline gradient-boosted regressor with default hyperparameters.
# `random_state` is the documented seed parameter of the scikit-learn wrapper
# (it replaces the legacy `seed` keyword).
model = XGBRegressor(random_state=0)

# `verbose` only affects logging when an `eval_set` is supplied, so it is omitted here.
model.fit(X_train, y_train)

predictions = model.predict(X_test)

# RMSE on the held-out split; scikit-learn metrics take (y_true, y_pred).
print(np.sqrt(mean_squared_error(y_test, predictions)))

0.03728025640948753


Now we can see that the most valuable column is 6. We should also look at column 7, because it has a small importance as well; all other features can be dropped.

In [37]:
def plot_features(booster, feature_names=None):
    """Show a bar chart of a fitted model's feature importances, in percent.

    Parameters
    ----------
    booster : fitted estimator
        Any object exposing ``feature_importances_``. The importances are read
        from this argument rather than from a notebook-level variable.
    feature_names : sequence, optional
        Labels for the bars, one per importance value. When omitted, the
        estimator's ``feature_names_in_`` attribute is used if it is available;
        otherwise the bars are labelled with their positional index.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    scores = booster.feature_importances_

    if feature_names is None:
        # Not every estimator exposes this attribute, hence the default.
        feature_names = getattr(booster, 'feature_names_in_', None)
    if feature_names is None:
        # String labels keep the x axis categorical.
        feature_names = [str(position) for position in range(len(scores))]

    importance = pd.DataFrame({
        'importance': scores * 100,
        'feature': list(feature_names),
    })
    fig = px.bar(importance.sort_values(by='importance', ascending=True),
                 x='feature', y='importance')
    fig.show()
  
# Print every feature whose importance is above a tiny threshold, keyed by its
# position in the importance array, then draw the full bar chart.
print(
    *(
        {position: score}
        for position, score in enumerate(model.feature_importances_)
        if score > 1e-5
    )
)
plot_features(model)

{6: 0.99989176} {7: 7.972622e-05}


So let's look at the data again, but only at columns 6, 7 and target. First, let's sort by target for a clearer view.

In [58]:
# Rows with the 15 smallest targets, restricted to the columns under investigation.
(
    data
    .sort_values('target')
    .loc[:, ['6', '7', 'target']]
    .head(15)
)

Unnamed: 0,6,7,target
68617,-0.0,0.002634,0.002634
41272,-0.0,0.002749,0.002749
34873,-0.0,0.003404,0.003404
58854,0.0,0.004395,0.004395
49569,-0.0,0.005463,0.005463
41288,0.0,0.008721,0.008721
60319,-0.0,0.009273,0.009273
48430,0.0,0.014745,0.014745
45866,0.0,0.016212,0.016212
71284,-0.0,0.017221,0.017221


Now we can clearly see that the fractional part of the target equals column 7, and the integer part equals column 6 squared.

In [59]:
# The same three columns, for rows 20000–20014 of the target-sorted frame.
(
    data
    .sort_values('target')
    .loc[:, ['6', '7', 'target']]
    .iloc[20000:20015]
)

Unnamed: 0,6,7,target
8889,-4.690416,0.200876,22.200876
60058,4.690416,0.201416,22.201416
44255,-4.690416,0.202935,22.202935
48282,-4.690416,0.203897,22.203897
46959,4.690416,0.204015,22.204015
40195,-4.690416,0.20636,22.20636
66652,4.690416,0.20673,22.20673
70224,-4.690416,0.206916,22.206916
58035,4.690416,0.207501,22.207501
89253,-4.690416,0.208917,22.208917


Let's now check the theory

In [63]:
def predict(df):
    """Compute the closed-form prediction: column '6' squared plus column '7'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``'6'`` and ``'7'``.

    Returns
    -------
    pandas.Series
        Element-wise ``df['6'] ** 2 + df['7']``, aligned on the index of ``df``.
    """
    squared_term = df['6'].pow(2)
    additive_term = df['7']
    return squared_term + additive_term

Now we have a much better result.

In [64]:
# Score the closed-form rule on a random subset of the training data.
# A fixed `random_state` makes the sample, and therefore the printed RMSE, reproducible.
sample_data = data.sample(10000, random_state=0)

predictions = predict(sample_data)

# scikit-learn metrics take (y_true, y_pred).
print(np.sqrt(mean_squared_error(sample_data['target'], predictions)))

1.3475705967581302e-14


So now we can make predictions and submit them to the competition for verification.

In [65]:
# Apply the closed-form rule to the hidden test set and save the predictions.
test = pd.read_csv('internship_hidden_test.csv')
predictions = predict(test)

# `predictions` is a one-dimensional Series, so transposing it is a no-op;
# it is written out directly.
np.savetxt("target.csv", predictions, delimiter=",")