In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler,PowerTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
import pickle
import warnings
warnings.filterwarnings('ignore')

In [72]:
df=pd.read_csv('forest_outliers_removed1')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


In [5]:
df.drop(columns=['Unnamed: 0'],axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


In [7]:
df['Classes']=df['Classes'].apply(lambda x:1 if x=='fire' else 0)

In [8]:
X=df.drop(columns=['Temperature','year'],axis=1)
y=df['Temperature']

In [14]:
scale=ColumnTransformer(transformers=[
    ('scale',PowerTransformer(),slice(0,11))
],remainder='passthrough')
tnf = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(sparse=False, drop='first'), [11])
], remainder='passthrough')


In [15]:
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.3,random_state=256)

In [16]:
pipe=Pipeline([
    ('scale',scale),
    ('model',RandomForestRegressor(random_state=42))
])
pipe.fit(X_train.values,y_train.values)
y_pred=pipe.predict(X_test.values)
print('R2 score: ',r2_score(y_test,y_pred))
print('MAE: ',mean_absolute_error(y_test,y_pred))

R2 score:  0.6834306722329265
MAE:  1.6848538011695904


In [36]:
pipe=Pipeline([
    ('scale',scale),
    ('model',RandomForestRegressor(random_state=42,n_jobs=-1))
])
# Number of trees in random forest
n_estimators = [100, 200, 300, 1000]
# Maximum number of levels in tree
max_depth = [80, 90, 100, 110]
max_depth.append(None)
min_samples_leaf =[3, 4, 5]
# Method of selecting samples for training each tree
criterion =['mse', 'mae']
max_features=[2,3]
random_grid={
    'model__n_estimators':n_estimators,
    'model__max_depth':max_depth,
    'model__min_samples_leaf':min_samples_leaf
}

In [37]:
gs=GridSearchCV(estimator=pipe,param_grid=random_grid,n_jobs=-1)

In [38]:
gs

In [33]:
gs.best_scores_

AttributeError: 'GridSearchCV' object has no attribute 'best_scores_'

In [39]:
gs.fit(X_train,y_train)

In [40]:
gs.best_params_

{'model__max_depth': 110,
 'model__min_samples_leaf': 3,
 'model__n_estimators': 1000}

In [41]:
gs.best_score_

0.39895987582177594

# Linear Regression

In [42]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=256)

In [43]:
pipe=Pipeline([
    ('tnf',tnf),
    ('scale',scale),
    ('model',LinearRegression(fit_intercept=True))
])
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
print("R2 SCORE: ",r2_score(y_test,y_pred))
print("MAE: ",mean_absolute_error(y_test,y_pred))

R2 SCORE:  0.5918778282630223
MAE:  1.8196039542139892


In [44]:
##ridge
pipe=Pipeline([
    ('tnf',tnf),
    ('scale',scale),
    ('model',Ridge(fit_intercept=True))
])
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
print("R2 SCORE: ",r2_score(y_test,y_pred))
print("MAE: ",mean_absolute_error(y_test,y_pred))

R2 SCORE:  0.6450344702764037
MAE:  1.6974444591238829


In [45]:
##lasso
pipe=Pipeline([
    ('tnf',tnf),
    ('scale',scale),
    ('model',Lasso(fit_intercept=True))
])
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
print("R2 SCORE: ",r2_score(y_test,y_pred))
print("MAE: ",mean_absolute_error(y_test,y_pred))

R2 SCORE:  0.4081281553409586
MAE:  2.1836026814260188


In [60]:
##svr
pipe=Pipeline([
    ('tnf',tnf),
    ('scale',scale),
    ('model',SVR(C=1,kernel='linear',gamma='auto'))
])
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
print("R2 SCORE: ",r2_score(y_test,y_pred))
print("MAE: ",mean_absolute_error(y_test,y_pred))

R2 SCORE:  0.6762028880370541
MAE:  1.6019435840932388


In [61]:
rf_params = {
    'C': [1,10, 100],
    "kernel":['poly','rbf','sigmoid'],
    "epsilon":[0.01,0.1,1]
}
grid_search=GridSearchCV(pipe,param_grid=rf_params,cv=10,)

In [65]:
##kNearest neighbors
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=458)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',KNeighborsRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5573551804205851
MAE 1.8865497076023394


In [66]:
rf_params = {
    'n_neighbors': [2, 3, 5, 7, 10]
}
grid_search=GridSearchCV(pipe,param_grid=rf_params,cv=10)

In [67]:
##decision tree regressor
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=657)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',DecisionTreeRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5071984416312796
MAE 2.0409356725146197


In [69]:
results = pd.DataFrame({
    'Model': ['Linear Regression','Lasso Regression', 'Ridge Regression','SVR' ,'Decision Tree','Random Forest'],
    'Score': [0.63,0.51,0.63,0.61,0.56,0.68]})

result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.68,Random Forest
0.63,Linear Regression
0.63,Ridge Regression
0.61,SVR
0.56,Decision Tree
0.51,Lasso Regression


conclusion:our best model is random forest regressor

In [73]:

pickle.dump(pipe,open('pipe_reg1.pkl','wb'))

In [74]:
import json
result = X_test.to_json(orient="records")
parsed = json.loads(result)

In [75]:
!pip install pymongo



In [76]:
import certifi
import pymongo
client = pymongo.MongoClient("mongodb+srv://tallojiharshith:tallojiharshith@cluster0.pyu7rs3.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0",tlsCAFile=certifi.where())

In [77]:
db = client.batch_data
print(db)

Database(MongoClient(host=['ac-pqgccm2-shard-00-01.pyu7rs3.mongodb.net:27017', 'ac-pqgccm2-shard-00-00.pyu7rs3.mongodb.net:27017', 'ac-pqgccm2-shard-00-02.pyu7rs3.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='Cluster0', authsource='admin', replicaset='atlas-x2ptah-shard-0', tls=True, tlscafile='C:\\Anaconda\\Lib\\site-packages\\certifi\\cacert.pem'), 'batch_data')


In [78]:
coll = db['regression_batch_data']

In [79]:
db.list_collection_names()

[]

In [80]:
coll.insert_many(parsed)

InsertManyResult([ObjectId('664caa440e2410da8438faf9'), ObjectId('664caa440e2410da8438fafa'), ObjectId('664caa440e2410da8438fafb'), ObjectId('664caa440e2410da8438fafc'), ObjectId('664caa440e2410da8438fafd'), ObjectId('664caa440e2410da8438fafe'), ObjectId('664caa440e2410da8438faff'), ObjectId('664caa440e2410da8438fb00'), ObjectId('664caa440e2410da8438fb01'), ObjectId('664caa440e2410da8438fb02'), ObjectId('664caa440e2410da8438fb03'), ObjectId('664caa440e2410da8438fb04'), ObjectId('664caa440e2410da8438fb05'), ObjectId('664caa440e2410da8438fb06'), ObjectId('664caa440e2410da8438fb07'), ObjectId('664caa440e2410da8438fb08'), ObjectId('664caa440e2410da8438fb09'), ObjectId('664caa440e2410da8438fb0a'), ObjectId('664caa440e2410da8438fb0b'), ObjectId('664caa440e2410da8438fb0c'), ObjectId('664caa440e2410da8438fb0d'), ObjectId('664caa440e2410da8438fb0e'), ObjectId('664caa440e2410da8438fb0f'), ObjectId('664caa440e2410da8438fb10'), ObjectId('664caa440e2410da8438fb11'), ObjectId('664caa440e2410da8438fb

In [81]:
pickle_model = pickle.load(open('pipe_reg2.pkl','rb'))

FileNotFoundError: [Errno 2] No such file or directory: 'pipe_reg2.pkl'

In [82]:
test_input = np.array([1,6,57,18.0,0.00,65.7000,3.4,7.6,1.3,3.4,0.5,0],dtype=object).reshape(1,12)

In [83]:
pipe.predict(test_input)

array([30.])