In [150]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
# train test split
from sklearn.model_selection import train_test_split
# temperature prediction
from scipy.stats import uniform, randint
from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, f1_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import LocalOutlierFactor
from sklearn import preprocessing
from sklearn.metrics import r2_score


# 2 Prepare Data

In [181]:
# read data
train = pd.read_csv('./data/IOT_Train.csv')
data = train
train.head()

Unnamed: 0,mac,station_name,tambon_code,tambon_namt,amphur_code,amphur_namt,province_code,province_namt,latitude,longitude,time,humid,light,pm10,pm2.5,rainfall,wind_direct,wind_speed,temp
0,3C71BF18EA64,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,2022-07-20 15:00:00+07:00,70.6,59.0,,,0.0,45.0,4.3,32.6
1,3C71BF18EA64,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,2022-05-16 09:00:00+07:00,62.3,76.0,,,0.0,45.0,1.9,37.9
2,3C71BF18EA64,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,2022-06-26 19:00:00+07:00,90.3,0.0,,,0.0,135.0,3.2,26.4
3,3C71BF18EA64,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,2022-07-16 04:00:00+07:00,91.7,0.0,,,0.0,157.5,0.0,25.6
4,3C71BF18EA64,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,2022-05-30 08:00:00+07:00,59.1,71.0,,,0.0,180.0,2.3,36.2


In [152]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13635 entries, 0 to 13634
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   mac            13635 non-null  object 
 1   station_name   13635 non-null  object 
 2   tambon_code    13635 non-null  int64  
 3   tambon_namt    13635 non-null  object 
 4   amphur_code    13635 non-null  int64  
 5   amphur_namt    13635 non-null  object 
 6   province_code  13635 non-null  int64  
 7   province_namt  13635 non-null  object 
 8   latitude       13635 non-null  float64
 9   longitude      13635 non-null  float64
 10  time           13635 non-null  object 
 11  humid          13635 non-null  float64
 12  light          13635 non-null  float64
 13  pm10           6841 non-null   float64
 14  pm2.5          6841 non-null   float64
 15  rainfall       13635 non-null  float64
 16  wind_direct    13635 non-null  float64
 17  wind_speed     13635 non-null  float64
 18  temp  

In [153]:
# drop row that temp = na
train = train[train['temp'].notna()]
# encode time features change Dtype time from object --> datetime64
train['time'] = pd.to_datetime(train['time'])
# add day in year column
train['day_in_year'] = train['time'].dt.dayofyear
# time in day(hours) from datetime
train['time_in_day'] = train['time'].dt.hour
# drop unused columns
train.drop(['mac', 'time','pm10','pm2.5'], axis=1, inplace=True)
print(train.info())
# drop row that has missing values
print(train.isna().sum())
train.dropna(inplace=True)
print('--------------------------')
print(train.isna().sum())
train=train.reset_index(drop = True)
# x=x.reset_index(drop = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13634 entries, 0 to 13634
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   station_name   13634 non-null  object 
 1   tambon_code    13634 non-null  int64  
 2   tambon_namt    13634 non-null  object 
 3   amphur_code    13634 non-null  int64  
 4   amphur_namt    13634 non-null  object 
 5   province_code  13634 non-null  int64  
 6   province_namt  13634 non-null  object 
 7   latitude       13634 non-null  float64
 8   longitude      13634 non-null  float64
 9   humid          13634 non-null  float64
 10  light          13634 non-null  float64
 11  rainfall       13634 non-null  float64
 12  wind_direct    13634 non-null  float64
 13  wind_speed     13634 non-null  float64
 14  temp           13634 non-null  float64
 15  day_in_year    13634 non-null  int64  
 16  time_in_day    13634 non-null  int64  
dtypes: float64(8), int64(5), object(4)
memory usage: 1

In [154]:
train.head()

Unnamed: 0,station_name,tambon_code,tambon_namt,amphur_code,amphur_namt,province_code,province_namt,latitude,longitude,humid,light,rainfall,wind_direct,wind_speed,temp,day_in_year,time_in_day
0,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,70.6,59.0,0.0,45.0,4.3,32.6,201,15
1,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,62.3,76.0,0.0,45.0,1.9,37.9,136,9
2,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,90.3,0.0,0.0,135.0,3.2,26.4,177,19
3,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,91.7,0.0,0.0,157.5,0.0,25.6,197,4
4,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,59.1,71.0,0.0,180.0,2.3,36.2,150,8


In [155]:
# # Cut out the outliers humid with sklearn
# clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
# y_pred = clf.fit_predict(train[['humid', 'temp']])
# X_scores = clf.negative_outlier_factor_
# outlier_index = np.where(y_pred == -1)
# outlier_index = outlier_index[0]
# print('outliers size', outlier_index.shape[0])
# print('From', train.shape[0], 'to', train.shape[0] - outlier_index.shape[0])
# # drop outliers
# train.drop(outlier_index, inplace=True)
# # plot scatter plot humidity per temperature
# plt.figure(figsize=(10, 10))
# plt.scatter(train['humid'], train['temp'], c=train['temp'], cmap='viridis')
# plt.xlabel('humidity')
# plt.ylabel('temperature')
# plt.title('humidity per temperature')
# plt.show()

In [156]:
# encode the categorical data to numerical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['tambon_namt'] = le.fit_transform(train['tambon_namt'])
train['station_name'] = le.fit_transform(train['station_name'])
train['province_namt'] = le.fit_transform(train['province_namt'])
train['amphur_namt'] = le.fit_transform(train['amphur_namt'])

# simplify feature by dropping out some features
train.drop(['tambon_code', 'amphur_code', 'province_code', 'province_namt', 'station_name'], axis=1, inplace=True)
train.head(2)

Unnamed: 0,tambon_namt,amphur_namt,latitude,longitude,humid,light,rainfall,wind_direct,wind_speed,temp,day_in_year,time_in_day
0,0,7,15.112831,102.052114,70.6,59.0,0.0,45.0,4.3,32.6,201,15
1,0,7,15.112831,102.052114,62.3,76.0,0.0,45.0,1.9,37.9,136,9


In [157]:
# split x , y
y = pd.DataFrame(train['temp'])
x = pd.DataFrame(train.drop(columns='temp'))
print(x)
print(y)

       tambon_namt  amphur_namt   latitude   longitude  humid  light  \
0                0            7  15.112831  102.052114   70.6   59.0   
1                0            7  15.112831  102.052114   62.3   76.0   
2                0            7  15.112831  102.052114   90.3    0.0   
3                0            7  15.112831  102.052114   91.7    0.0   
4                0            7  15.112831  102.052114   59.1   71.0   
...            ...          ...        ...         ...    ...    ...   
13629            6            5  16.494229  104.350891   97.9    0.0   
13630            6            5  16.494229  104.350891   80.6    0.0   
13631            6            5  16.494229  104.350891   87.9   37.0   
13632            6            5  16.494229  104.350891   89.9    0.0   
13633            6            5  16.494229  104.350891   66.3   43.0   

       rainfall  wind_direct  wind_speed  day_in_year  time_in_day  
0           0.0         45.0         4.3          201           15

In [158]:
# correlation
lower = pd.DataFrame(np.tril(x.corr(),-1),columns = x.corr().columns)
to_drop = [column for column in lower.columns if any(lower[column] > 0.8)]
x.drop(columns = to_drop,inplace=True)
to_drop

[]

In [159]:
x

Unnamed: 0,tambon_namt,amphur_namt,latitude,longitude,humid,light,rainfall,wind_direct,wind_speed,day_in_year,time_in_day
0,0,7,15.112831,102.052114,70.6,59.0,0.0,45.0,4.3,201,15
1,0,7,15.112831,102.052114,62.3,76.0,0.0,45.0,1.9,136,9
2,0,7,15.112831,102.052114,90.3,0.0,0.0,135.0,3.2,177,19
3,0,7,15.112831,102.052114,91.7,0.0,0.0,157.5,0.0,197,4
4,0,7,15.112831,102.052114,59.1,71.0,0.0,180.0,2.3,150,8
...,...,...,...,...,...,...,...,...,...,...,...
13629,6,5,16.494229,104.350891,97.9,0.0,4.2,0.0,0.0,165,1
13630,6,5,16.494229,104.350891,80.6,0.0,0.0,135.0,2.9,123,2
13631,6,5,16.494229,104.350891,87.9,37.0,0.0,45.0,3.0,139,6
13632,6,5,16.494229,104.350891,89.9,0.0,0.0,315.0,0.0,166,2


In [160]:
x.loc[[13633]]

Unnamed: 0,tambon_namt,amphur_namt,latitude,longitude,humid,light,rainfall,wind_direct,wind_speed,day_in_year,time_in_day
13633,6,5,16.494229,104.350891,66.3,43.0,0.0,45.0,0.8,156,18


In [161]:
# stadard_scaler data
standard_scaler = preprocessing.StandardScaler()
standard_x = pd.DataFrame(standard_scaler.fit_transform(x),columns= x.columns)
standard_x

Unnamed: 0,tambon_namt,amphur_namt,latitude,longitude,humid,light,rainfall,wind_direct,wind_speed,day_in_year,time_in_day
0,-1.534273,1.523066,-0.164291,0.148356,-0.790050,0.747884,-0.122115,-1.164589,-0.001287,1.302950,0.506518
1,-1.534273,1.523066,-0.164291,0.148356,-1.457154,1.240543,-0.122115,-1.164589,-0.014497,-1.162482,-0.360219
2,-1.534273,1.523066,-0.164291,0.148356,0.793318,-0.961933,-0.122115,-0.267628,-0.007342,0.392637,1.084343
3,-1.534273,1.523066,-0.164291,0.148356,0.905842,-0.961933,-0.122115,-0.043388,-0.024956,1.151231,-1.082499
4,-1.534273,1.523066,-0.164291,0.148356,-1.714351,1.095644,-0.122115,0.180853,-0.012296,-0.631466,-0.504675
...,...,...,...,...,...,...,...,...,...,...,...
13629,1.079858,0.649411,0.424716,1.202804,1.404160,-0.961933,3.653606,-1.613069,-0.024956,-0.062520,-1.515868
13630,1.079858,0.649411,0.424716,1.202804,0.013690,-0.961933,-0.122115,-0.267628,-0.008993,-1.655568,-1.371412
13631,1.079858,0.649411,0.424716,1.202804,0.600420,0.110325,-0.122115,-1.164589,-0.008443,-1.048692,-0.793587
13632,1.079858,0.649411,0.424716,1.202804,0.761168,-0.961933,-0.122115,1.526294,-0.024956,-0.024590,-1.371412


In [162]:
# spliet train and test data
X = standard_x
Y = y
seed = 62
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [163]:
# Create Model List
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR


# regression = { 'LR': LinearRegression(), 'SVR': SVR(), }
# regression = { 'LR': LinearRegression()}
# # Create Parameter Dictionary for Linear Regression
# fit_intercept = [True, False]
# normalize = [True, False]
# params_LR = dict( fit_intercept = fit_intercept, normalize = normalize)
# #Create Parameter Dictionary for SVR
# kernel = ['linear', 'rbf', 'poly']
# C_list = [10, 100]
# C_list = [100]
# ep_list = [0.1, 1, 5]
# ep_list = [0.1]
# gamma = [0.01, 0.1]
# gamma = [0.1]
# degree = [2, 3]       #3x2x3x2x2 = 72
# degree = [2]       #3x2x3x2x2 = 72
# params_SVR = dict( kernel = kernel, C = C_list, epsilon = ep_list, gamma = gamma, degree = degree )

In [164]:
# for EST in regression:
#     model = regression[EST]
#     if (EST == 'LR'):
#         params = params_LR
#     else:
#         params = params_SVR

#     grid = GridSearchCV( 
#     estimator=model,                         #model
#     n_jobs = 8,                              #thread ที่ใช้               
#     verbose = 0,                             #ความละเอียดในการเปิด log
#     cv = 5,                                  #kfold
#     scoring = 'neg_mean_absolute_error',     #neg mean - mse
#     param_grid = params)                     #parameter ที่ใช้กับ model
#     grid_result = grid.fit(x_train, y_train)

In [165]:
# # Show Best Parameters for both models
# print('Best params: ',grid_result.best_params_)
# print('Best score: ', grid_result.best_score_)

In [166]:
model_svr = SVR(kernel = 'rbf', epsilon = 0.1, C = 100 )
model_svr.fit(x_train, y_train)
y_pred = model_svr.predict(x_test)
r2 = r2_score(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
print('MAE : ',mae)
print('r2 : ',r2)

  y = column_or_1d(y, warn=True)


MAE :  0.7111935634442602
r2 :  0.945530750212812


In [167]:
model_LM = LinearRegression(fit_intercept=True, n_jobs=4, normalize=False)
model_LM.fit(x_train, y_train)
y_pred = model_LM.predict(x_test)
r2 = r2_score(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
print('MAE : ',mae)
print('r2 : ',r2)

MAE :  1.1219082488508016
r2 :  0.8671351702672988




In [168]:
# import catboost
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor(iterations=3000, learning_rate=0.04, depth=9, loss_function='RMSE')
cat_model.fit(x_train, y_train)

0:	learn: 4.2113836	total: 10.7ms	remaining: 32s
1:	learn: 4.0697680	total: 29.3ms	remaining: 43.9s
2:	learn: 3.9288368	total: 33.5ms	remaining: 33.5s
3:	learn: 3.7959514	total: 37.6ms	remaining: 28.2s
4:	learn: 3.6705381	total: 53.8ms	remaining: 32.2s
5:	learn: 3.5490261	total: 58.3ms	remaining: 29.1s
6:	learn: 3.4311670	total: 69.6ms	remaining: 29.7s
7:	learn: 3.3215215	total: 89.8ms	remaining: 33.6s
8:	learn: 3.2139233	total: 111ms	remaining: 36.7s
9:	learn: 3.1102256	total: 129ms	remaining: 38.6s
10:	learn: 3.0111815	total: 141ms	remaining: 38.3s
11:	learn: 2.9170958	total: 146ms	remaining: 36.3s
12:	learn: 2.8280969	total: 151ms	remaining: 34.8s
13:	learn: 2.7428397	total: 156ms	remaining: 33.2s
14:	learn: 2.6584047	total: 162ms	remaining: 32.3s
15:	learn: 2.5765839	total: 167ms	remaining: 31.1s
16:	learn: 2.4987309	total: 171ms	remaining: 30s
17:	learn: 2.4247791	total: 175ms	remaining: 29s
18:	learn: 2.3537210	total: 179ms	remaining: 28s
19:	learn: 2.2867169	total: 182ms	remaini

<catboost.core.CatBoostRegressor at 0x29739db50>

In [169]:
# eval catboost
y_pred = cat_model.predict(x_test)
print('R2: ', r2_score(y_test, y_pred))
print('MAE: ', mean_absolute_error(y_test, y_pred))


R2:  0.9800526542660357
MAE:  0.45058633179758667


In [182]:
# 0 - 13633
n = 0
row=data.loc[[n]]
print(row)
t=row.temp
preT = X.loc[[n]]
print('---------------------------------------------------------------------')
print('ค่า temp ที่ ถูกต้องใน row ที่',t)
print('---------------------------------------------------------------------')
print('ค่า temp ที่ predict ได้ใน row ที่ ',n,cat_model.predict(preT))


            mac station_name  tambon_code tambon_namt  amphur_code  \
0  3C71BF18EA64     บ้านนา_2       300903       กำปัง         3009   

  amphur_namt  province_code province_namt   latitude   longitude  \
0      โนนไทย             30    นครราชสีมา  15.112831  102.052114   

                        time  humid  light  pm10  pm2.5  rainfall  \
0  2022-07-20 15:00:00+07:00   70.6   59.0   NaN    NaN       0.0   

   wind_direct  wind_speed  temp  
0         45.0         4.3  32.6  
---------------------------------------------------------------------
ค่า temp ที่ ถูกต้องใน row ที่ 0    32.6
Name: temp, dtype: float64
---------------------------------------------------------------------
ค่า temp ที่ predict ได้ใน row ที่  0 [32.53939248]


# ทดลอง model

In [171]:
#Decission Tree regressor
from sklearn.tree import DecisionTreeRegressor #import model
model = DecisionTreeRegressor(max_depth=10, #tune parameter
                           min_samples_leaf=1,
                           random_state=123)
model.fit(x_train, y_train)
x_train = x_train.astype(np.float32)

#Predict y(test_data)
y_pred = model.predict(x_test)
# y_pred = cat_model.predict(x_test)
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('R2: ', r2_score(y_test, y_pred))


MAE:  0.8035214664109366
R2:  0.9348305873331757


In [172]:
# Random Forest parameter
ASM_function = ['entropy','gini']
nEstimator = 100 
nJob = 8
rState =10
# Model Training 
from sklearn.ensemble import RandomForestRegressor
RandomF = RandomForestRegressor(n_estimators=nEstimator,n_jobs=nJob,random_state=rState)
RandomF.fit(x_train,y_train)
#Model Testing
y_pred = RandomF.predict(x_test)
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('R2: ', r2_score(y_test, y_pred))

  RandomF.fit(x_train,y_train)


MAE:  0.6073659699303271
R2:  0.9630127999491233
