In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [46]:
# Read the training and test splits from the local `dataset/` directory.
train, test = (
    pd.read_csv('dataset/train.csv'),
    pd.read_csv('dataset/test.csv'),
)

# The 'id' column is deliberately left in place in both frames; nothing is
# dropped at load time.

# Plain-text preview of the first training rows.
print(train.head())

    id        first    second     third       label
0   16   621.022461 -2.341116  2.219315   45.386553
1   18   591.331994  1.586732  1.459008   62.487210
2   10   540.675613  0.491446  1.167377   39.327242
3  111  1994.352060 -3.207705  1.264370  199.909617
4  101  1986.083550 -5.137871  1.713998  196.248352


In [47]:
# Names of the columns used as model inputs (everything else, e.g. 'id' and
# 'label', is excluded by not being listed here).
features = [
    'first',
    'second',
    'third',
]

In [49]:
# Hold-out evaluation of the linear model.
#
# 20% of the training rows are set aside (fixed seed, so the split is
# repeatable) and used only for scoring. Note that X_test / y_test below are
# that hold-out slice of `train`, not the separate `test` frame.

X, y = train[features], train['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics on the hold-out slice; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for metric_name, metric_value in (
    ("MSE", mse),
    ("RMSE", rmse),
    ("MAE", mae),
    ("R2", r2),
):
    print(f"{metric_name} : {metric_value}")

MSE : 1346.3432404691519
RMSE : 36.69255020394674
MAE : 29.251315290557336
R2 : 0.648621220379622


In [50]:
# Final model and submission file.
#
# Refit on every training row (no hold-out this time), then predict the
# label for each row of the test frame.
model = LinearRegression().fit(train[features], train['label'])
preds = model.predict(test[features])

# Two-column frame: the test ids alongside their predicted labels.
# The prediction array is attached positionally, one value per test row.
submission = test[['id']].assign(label=preds)

print(submission.head())

# Written without the DataFrame index so the file holds only 'id' and 'label'.
submission.to_csv('submission.csv', index=False)

    id       label
0  108  189.170245
1   67  133.865333
2   31   78.655389
3  119  204.099021
4   42  127.976179
