In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder,PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
import pickle

In [8]:
file_path = "Outlier_removed.csv"
df = pd.read_csv(file_path)
df['Classes']=df['Classes'].apply(lambda x :1 if x == 'fire' else 0)

In [9]:
X = df.drop(columns=['Temperature','year'],axis=1)
y = df['Temperature']

In [10]:
scale = ColumnTransformer(transformers=[
    ('scale', PowerTransformer(),slice(0,11))
],remainder='passthrough')
tnf = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(sparse=False, drop= 'first'),[11])
],remainder='passthrough')



## Linear Regression

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',LinearRegression(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6274467795991874
MAE 1.6555082527105962


## Ridge Regression

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',LinearRegression(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6274467795991874
MAE 1.6555082527105962


## Lasso Regression

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=728)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',Lasso(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.4288385456284802
MAE 2.0427523025556718


## SVR

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=480)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',SVR(C=1,kernel='linear',gamma='auto',max_iter=5e4))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6160776443762215
MAE 1.831236127863886


## K Neighbors Regressor

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=458)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',KNeighborsRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5492459133457336
MAE 1.8514619883040935


## Decision Tree Regressor

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=657)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',DecisionTreeRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.3598262301715661
MAE 2.0760233918128654


## Random Forest Regression

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=256)
pipe = Pipeline([
        ('scale',scale),
        ('model',RandomForestRegressor(random_state=96))
    ])
pipe.fit(X_train.values, y_train.values)
y_pred = pipe.predict(X_test.values)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6745237475900951
MAE 1.6928070175438594


In [24]:
pipe = Pipeline([
    ('scale',scale),
    ('model',RandomForestRegressor(random_state=96,n_jobs=-1))
])
# Number of trees in random forest
n_estimators = [100, 200, 300, 1000]
# Maximum number of levels in tree
max_depth = [80, 90, 100, 110]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [8, 10, 12],
# Minimum number of samples required at each leaf node
min_samples_leaf =[3, 4, 5]
# Method of selecting samples for training each tree
criterion =['mse', 'mae']
max_features=[2,3]
# Create the random grid
random_grid = {'model__n_estimators': n_estimators,
               'model__max_depth': max_depth,
               'model__min_samples_split': min_samples_split,
               'model__min_samples_leaf': min_samples_leaf,
               'model__max_features': max_features 
            }
pipe.fit(X_train.values, y_train.values)
y_pred = pipe.predict(X_test.values)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6745237475900951
MAE 1.6928070175438594


In [20]:
results = pd.DataFrame({
    'Model': ['Linear Regression','Lasso Regression', 
              'Ridge Regression','SVR' ,'Decision Tree','Random Forest'],
    'Score': [0.62,0.42,0.62,0.61,0.35,0.67]})

result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.7,Random Forest
0.63,Linear Regression
0.63,Ridge Regression
0.61,SVR
0.56,Decision Tree
0.51,Lasso Regression


In [25]:
### Creating pickle file
pickle.dump(pipe,open('../pipe_reg1.pkl','wb'))

In [26]:


### Preparing data to create batch prediction
import json
result = X_test.to_json(orient="records")
parsed = json.loads(result)



In [29]:


# Testing created Pipe
pickle_model = pickle.load(open('pipe_reg1.pkl','rb'))
test_input = np.array([1,6,57,18.0,0.00,65.7000,3.4,7.6,1.3,3.4,0.5,0],dtype=object).reshape(1,12)
pipe.predict(test_input)

array([29.21])