In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
data= pd.read_csv("odev_tenis.csv")
data.head()

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,85,85,False,no
1,sunny,80,90,True,no
2,overcast,83,86,False,yes
3,rainy,70,96,False,yes
4,rainy,68,80,False,yes


Preprocessing

In [3]:
#for outlook, there is no meaningfull order among categories one-hot encoding can be applied
#label encoding is applied for "windy" and "play" since we want to encode true as 1, false as 0
from sklearn.preprocessing import LabelEncoder
df= pd.DataFrame(data)

le = LabelEncoder()

df["windy"]=le.fit_transform(df["windy"])
df["play"]=le.transform(df["play"])

#!OneHotEncoder expects a 2d array or dataframe with mutliple columns
#cannot be applied directly to a single column so instead we use get_dummies() from pandas
df=pd.get_dummies(df, columns=["outlook"])
df.head()

Unnamed: 0,temperature,humidity,windy,play,outlook_overcast,outlook_rainy,outlook_sunny
0,85,85,0,1,0,0,1
1,80,90,1,1,0,0,1
2,83,86,0,1,1,0,0
3,70,96,0,1,0,1,0
4,68,80,0,1,0,1,0


Feature Scaling

In [4]:
#the temperature and humidity variables are already in the similar scale standardizin may not be
#necessary, but doing it just to be sure
from sklearn.preprocessing import MinMaxScaler
scaler= MinMaxScaler()
df[["temperature","humidity"]] = scaler.fit_transform(df[["temperature","humidity"]])
df.head()

Unnamed: 0,temperature,humidity,windy,play,outlook_overcast,outlook_rainy,outlook_sunny
0,1.0,0.645161,0,1,0,0,1
1,0.761905,0.806452,1,1,0,0,1
2,0.904762,0.677419,0,1,1,0,0
3,0.285714,1.0,0,1,0,1,0
4,0.190476,0.483871,0,1,0,1,0


Train test split

In [5]:
#we will be predicting humidity, rest of the variables will be independent variables
from sklearn.model_selection import train_test_split
X=df.drop(columns=["humidity"])
y=df["humidity"]

X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.33,random_state=0)
print(y_test)

8     0.161290
6     0.000000
4     0.483871
11    0.806452
2     0.677419
Name: humidity, dtype: float64


    Mutli-linear regression

In [6]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)


performance evaluation

In [7]:
from sklearn.metrics import mean_squared_error, r2_score
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

print("Mse: ",mse, " r2: ",r2)

Mse:  0.23516485691684505  r2:  -1.5403937443467632
