In [288]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

In [289]:
# Load the raw dataset from data.csv (path is relative to the working directory).
df = pd.read_csv('data.csv')

In [290]:
# Preview the first five rows to check the column layout.
df.head()

Unnamed: 0,ILLUM,HUMID,CO2,SOUND,TEMP,RYTHM,ID
0,379.52,75.96,563.78,24.74,20.3,82.36,3120
1,411.15,54.17,510.42,35.28,20.25,79.93,3121
2,335.73,63.25,466.68,37.46,20.89,89.4,3122
3,350.15,71.9,451.37,21.01,19.3,77.67,3123
4,505.77,66.61,541.35,35.35,21.0,82.66,3124


In [291]:
# (rows, columns) of the loaded frame.
df.shape

(3200, 7)

In [292]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3200 entries, 0 to 3199
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ILLUM   3200 non-null   float64
 1   HUMID   3200 non-null   float64
 2   CO2     3200 non-null   float64
 3   SOUND   3200 non-null   float64
 4   TEMP    3200 non-null   float64
 5   RYTHM   3200 non-null   float64
 6   ID      3200 non-null   int64  
dtypes: float64(6), int64(1)
memory usage: 175.1 KB


In [293]:
# Count missing values per column.
df.isna().sum()

ILLUM    0
HUMID    0
CO2      0
SOUND    0
TEMP     0
RYTHM    0
ID       0
dtype: int64

In [294]:
# Check that no ID value occurs more than once.
ids = df.loc[:, 'ID']
ids.duplicated().sum()

0

In [295]:
# Box plot of ILLUM to inspect its spread and outliers.
px.box(data_frame=df, y='ILLUM')

In [296]:
# Box plot of HUMID to inspect its spread and outliers.
px.box(data_frame=df, y='HUMID')

In [297]:
# Box plot of CO2 to inspect its spread and outliers.
px.box(data_frame=df, y='CO2')

In [298]:
# Box plot of SOUND to inspect its spread and outliers.
px.box(data_frame=df, y='SOUND')

In [299]:
# Box plot of TEMP to inspect its spread and outliers.
px.box(data_frame=df, y='TEMP')

In [300]:
# Replace out-of-range values in each column with that column's median.
# Each entry maps a column to its hard-coded (lower, upper) cut-offs.
outlier_limits = {
    'ILLUM': (199.38, 684.35),
    'HUMID': (54.01, 80.05),
    'CO2': (383.54, 617.69),
    'SOUND': (13.07, 46.45),
    'TEMP': (15.57, 24.5),
}
for column, (lower, upper) in outlier_limits.items():
    out_of_range = (df[column] < lower) | (df[column] > upper)
    # The median is computed before the assignment, so it still includes
    # the out-of-range values being replaced.
    df.loc[out_of_range, column] = df[column].median()

In [301]:
# df.loc[(df['ILLUM'] < 199.38) | (df['ILLUM'] > 684.35), 'ILLUM'] = df['ILLUM'].mean()
# df.loc[(df['HUMID'] < 54.01) | (df['HUMID'] > 80.05), 'HUMID'] = df['HUMID'].mean()
# df.loc[(df['CO2'] < 383.54) | (df['CO2'] > 617.69), 'CO2'] = df['CO2'].mean()
# df.loc[(df['SOUND'] < 13.07) | (df['SOUND'] > 46.45), 'SOUND'] = df['SOUND'].mean()
# df.loc[(df['TEMP'] < 15.57) | (df['TEMP'] > 24.5), 'TEMP'] = df['TEMP'].mean()

In [302]:
# ILLUM column as a NumPy array.
ILLUM_values = df['ILLUM'].to_numpy()
ILLUM_values

array([379.52, 411.15, 335.73, ..., 413.62, 487.22, 481.05])

In [303]:
# HUMID column as a NumPy array.
HUMID_values = df['HUMID'].to_numpy()
HUMID_values

array([75.96, 54.17, 63.25, ..., 72.42, 71.4 , 66.25])

In [304]:
# CO2 column as a NumPy array.
CO2_values = df['CO2'].to_numpy()
CO2_values

array([563.78, 510.42, 466.68, ..., 532.05, 611.28, 528.06])

In [305]:
# SOUND column as a NumPy array.
SOUND_values = df['SOUND'].to_numpy()
SOUND_values

array([24.74, 35.28, 37.46, ..., 21.69, 20.31, 25.49])

In [306]:
# TEMP column as a NumPy array.
TEMP_values = df['TEMP'].to_numpy()
TEMP_values

array([20.3 , 20.25, 20.89, ..., 17.46, 19.22, 20.99])

In [307]:
# Regression target: the RYTHM column as a NumPy array.
y = df['RYTHM'].to_numpy()
y

array([82.36, 79.93, 89.4 , ..., 75.67, 85.68, 75.  ])

In [308]:
# Correlation matrix of ILLUM vs. the target; the off-diagonal entry is the coefficient.
np.corrcoef(ILLUM_values, y)

array([[1.        , 0.04624605],
       [0.04624605, 1.        ]])

In [309]:
# Correlation matrix of HUMID vs. the target; the off-diagonal entry is the coefficient.
np.corrcoef(HUMID_values, y)

array([[ 1.        , -0.02637475],
       [-0.02637475,  1.        ]])

In [310]:
# Correlation matrix of CO2 vs. the target; the off-diagonal entry is the coefficient.
np.corrcoef(CO2_values, y)

array([[1.        , 0.26373932],
       [0.26373932, 1.        ]])

In [311]:
# Correlation matrix of SOUND vs. the target; the off-diagonal entry is the coefficient.
np.corrcoef(SOUND_values, y)

array([[1.        , 0.16148928],
       [0.16148928, 1.        ]])

In [312]:
# Correlation matrix of TEMP vs. the target; the off-diagonal entry is the coefficient.
np.corrcoef(TEMP_values, y)

array([[1.        , 0.24856499],
       [0.24856499, 1.        ]])

In [313]:
# Feature matrix: every column except the identifier, HUMID and the target.
excluded_columns = ['ID', 'HUMID', 'RYTHM']
X = df.drop(excluded_columns, axis=1).to_numpy()
X

array([[379.52, 563.78,  24.74,  20.3 ],
       [411.15, 510.42,  35.28,  20.25],
       [335.73, 466.68,  37.46,  20.89],
       ...,
       [413.62, 532.05,  21.69,  17.46],
       [487.22, 611.28,  20.31,  19.22],
       [481.05, 528.06,  25.49,  20.99]])

In [314]:
# scaler = StandardScaler()
# X = scaler.fit_transform(X)
# X

In [315]:
# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)
# X

In [316]:
# Hold out 10% of the rows for evaluation. random_state pins the shuffle so the
# split, and every metric computed from it, is reproducible across
# Restart & Run All; without it each run produces a different score.
# NOTE: the outputs recorded in this notebook came from an unseeded run and
# will differ once this cell is re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [317]:
# Fit a linear regression on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [318]:
# Model score (R^2) on the held-out split.
score = lr.score(X=X_test, y=y_test)
score


0.2569421880321672

In [319]:
# Adjusted R^2: penalise the score for the number of features used,
# 1 - (1 - R^2) * (n - 1) / (n - p - 1).
n_obs = len(y_test)
n_features = X_test.shape[1]
1 - (1 - score) * (n_obs - 1) / (n_obs - n_features - 1)

0.2475065332770201

In [320]:
# Mean absolute error of the predictions on the held-out split.
y_pred = lr.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

4.519219919999707

In [332]:
# Load the rows that need a RYTHM prediction.
answers = pd.read_csv('answers.csv')

# Select the model inputs by name, in the same order as the training matrix X
# (ILLUM, CO2, SOUND, TEMP). Dropping "everything else" instead would feed the
# model extra or misordered columns if answers.csv ever differs in layout from
# data.csv, and would raise KeyError if the RYTHM column is absent.
# NOTE(review): the outlier replacement applied to df is not applied to these
# rows -- confirm that is intended.
feature_columns = ['ILLUM', 'CO2', 'SOUND', 'TEMP']
to_predict = answers[feature_columns].values
result = lr.predict(to_predict)

In [335]:
# Store the predictions in the RYTHM column and preview the result.
answers = answers.assign(RYTHM=result)
answers.head()

Unnamed: 0,ID,ILLUM,HUMID,CO2,SOUND,TEMP,RYTHM
0,0,337.9,63.29,518.13,22.38,18.59,76.978088
1,1,342.96,63.19,488.85,37.36,20.41,80.687376
2,2,369.86,57.39,516.86,26.45,20.89,80.672351
3,3,304.65,67.8,517.5,32.98,22.52,83.359283
4,4,442.25,68.21,500.91,37.28,21.88,83.781831


In [336]:
# Write the frame with predictions to result.csv, omitting the row index.
answers.to_csv('result.csv', index=False)

In [321]:
# !pip install h2o
# import h2o
# from h2o.automl import H2OAutoML
# h2o.init()

In [322]:
# data = h2o.import_file("data.csv")
# data.describe()

In [323]:
# train, valid = data.split_frame(ratios = [.8])

In [324]:
# x = data.columns
# y = "RYTHM"
# x.remove(y)
# x.remove("ID")

In [325]:
# aml = H2OAutoML(max_models=30, seed=1)
# aml.train(x = x, y = y, training_frame = train, validation_frame = valid)

In [326]:
# aml.leader.r2()

In [327]:
# aml.leader.r2(valid=True)

In [328]:
# aml.leader.mae(valid=True)