In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the red-wine quality dataset.
# The r-prefix keeps the Windows backslashes literal: without it "\s", "\d" and "\w"
# are invalid escape sequences (a warning today, slated to become an error).
# NOTE(review): this is a machine-specific absolute path — consider a configurable data directory.
df=pd.read_csv(r'L:\solitare infosis\djangowithml\winequality-red.csv')
# top 5 rows
df.head()

In [None]:
# check shape of data: (number of rows, number of columns)
df.shape

In [None]:
# print a summary of the frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Target: the 'quality' column. Features: every other column.
y = df['quality']
x = df.drop(columns=['quality'])

In [None]:
# Bar chart of how many rows fall in each quality class;
# the uneven bars show that the classes are imbalanced.
fig, ax = plt.subplots()
ax.set_title('Quality (Dependent Column)')
df['quality'].value_counts().plot.bar(ax=ax);

In [None]:
# The quality classes are imbalanced, so oversample the minority classes with SMOTE first.
# NOTE(review): resampling happens before any train/test split, so synthetic rows can be
# derived from rows that later end up in a test fold — consider resampling the training
# portion only.
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE
# Fixed seed so the synthetic samples (and everything computed from them) are reproducible.
smote=SMOTE(random_state=42)
x,y=smote.fit_resample(x,y)
# Per-class row counts after resampling.
counter = Counter(y)
print(counter)

In [None]:
# Distribution of the target after the imbalance has been fixed.
fig, ax = plt.subplots()
ax.set_title('Quality (Dependent Column)')
ax.hist(y);

In [None]:
# Label encoding: map the quality values to integer class codes and
# store them as a one-column frame named 'quality'.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = pd.DataFrame({'quality': le.fit_transform(y)})
y


In [None]:
# display the feature matrix (as it stands after resampling)
x

In [None]:
# Feature selection
# Heatmap of the pairwise correlations between the features.
feature_corr = x.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(feature_corr, annot=True, cmap="YlGnBu");

In [None]:
# Correlation with the dependent variable:
# join the features and the target column-wise so one correlation matrix covers both.
df1=pd.concat([x,y],axis=1)
# A bare `df1.head()` in the middle of a cell is evaluated and thrown away;
# display() (provided by IPython/Jupyter) actually renders the preview.
display(df1.head())

plt.figure(figsize=(10,6))
sns.heatmap(df1.corr(),annot=True,cmap="YlGnBu");

In [None]:
# Keep only the selected features: remove the listed columns together with the target.
dropped_columns = [
    'pH',
    'density',
    'total sulfur dioxide',
    'chlorides',
    'volatile acidity',
    'residual sugar',
    'quality',
]
X = df1.drop(columns=dropped_columns)
y = df1['quality']

In [None]:
# KDE plot of every selected feature, one figure per column.
columns = X.columns
for feature in columns:
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.set_title(feature)
    sns.kdeplot(X[feature], ax=ax)
plt.show()
    

In [None]:
# Transformation of the data:
# the features are right-skewed, so apply log(1 + value) to them.
# NOTE: this cell overwrites X with its transformed version, so running it a
# second time applies the transform again.
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(func=np.log1p)
X = transformer.fit(X).transform(X)

In [None]:
# KDE plot of every feature after the transform, one figure per column.
columns = X.columns
for feature in columns:
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.set_title(feature)
    sns.kdeplot(X[feature], ax=ax)

plt.show()

In [None]:
# feature scaling
# from sklearn.preprocessing import StandardScaler
# sc=StandardScaler()
# sc.fit(X)
# X=sc.transform(X)

In [None]:
# (rows, columns) of the feature matrix used for modelling
X.shape

In [None]:
# Model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error,accuracy_score

def modelselection(model,x,y,test_size=0.25,random_state=None):
    """Fit ``model`` on a train split and print hold-out and cross-validation metrics.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``.
    x : feature matrix.
    y : target labels.
    test_size : fraction of the rows held out for testing (default 0.25, as before).
    random_state : seed forwarded to ``train_test_split``; the default ``None``
        keeps the previous behaviour of a different random split on every call.

    Prints the hold-out score, the mean 5-fold cross-validation score, the
    confusion matrix, the RMSE and the accuracy. Returns ``None``.
    """
    xtrain,xtest,ytrain,ytest = train_test_split(
        x,y,test_size=test_size,random_state=random_state)
    model.fit(xtrain,ytrain)
    pred=model.predict(xtest)
    print('score',model.score(xtest,ytest))
    # Cross-validation is run on the full x/y, independently of the split above.
    cscore=cross_val_score(model,x,y,cv=5,n_jobs=-1)
    print("cross val score",np.mean(cscore))
    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
    # confusion matrix comes out transposed (rows would be predictions, not true labels).
    cm=confusion_matrix(ytest,pred)
    print('Confusion matrix',cm)
    print('rmse',np.sqrt(mean_squared_error(ytest,pred)))
    print('accuracy_score',accuracy_score(ytest,pred))
    
    

In [None]:
# Logistic regression with max_iter set to 1000, evaluated with modelselection.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000)
modelselection(model=lr, x=X, y=y)

In [None]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
# Fixed seed: tree construction is randomised, so an unseeded
# estimator can give different results on every run.
dt=DecisionTreeClassifier(random_state=42)
modelselection(dt,X,y)

In [None]:
# random forest
from sklearn.ensemble import RandomForestClassifier
# Fixed seed: the forest's bootstrapping and feature sampling are random, so an unseeded
# estimator gives different results on every run.
rf=RandomForestClassifier(random_state=42)
modelselection(rf,X,y)


In [None]:
# XGBoost classifier, evaluated with modelselection.
import xgboost
xg = xgboost.XGBClassifier(use_label_encoder=False)
modelselection(model=xg, x=X, y=y)

In [None]:
# extra trees
from sklearn.ensemble import ExtraTreesClassifier
# Fixed seed: the split thresholds are drawn at random, so an unseeded
# estimator gives different results on every run.
et=ExtraTreesClassifier(random_state=42)
modelselection(et,X,y)

In [None]:
# The extra-trees classifier performs well, so it is the one tuned below.
# Hyperparameter tuning (grid search, currently disabled)
# from sklearn.model_selection import GridSearchCV
# parameter={
#     'n_estimators':[100,200,90,150,300,500],
#     'criterion':['gini','entropy'],
#     'max_depth': [None,2,8,16,32,50,100],
#     'min_samples_split': [2,4,6],
#     'min_samples_leaf': [1,2],
# }
# gscv=GridSearchCV(et,parameter,cv=10,n_jobs=-1)
# gscv.fit(X,y)

        
   

In [None]:
# after hyperparameter tuning
from sklearn.ensemble import ExtraTreesClassifier
# Fixed seed so the tuned model's score does not vary between runs because of the
# estimator's own randomness.
et=ExtraTreesClassifier(max_depth=50,min_samples_split=2,random_state=42)
modelselection(et,X,y)

In [None]:
# Predict the class of one raw sample.
# The previous call passed np.array() with no arguments, which raises TypeError
# before any prediction is made. Use the first row of the untransformed data,
# restricted to the selected feature columns, and push it through the same
# transformer that was applied to the training features.
raw_sample=df[X.columns].iloc[[0]]
r=et.predict(transformer.transform(raw_sample))

In [None]:
# One sample of five feature values as a single-row 2-D array, shape (1, 5).
vv = np.array([[1.85016, 0.217164, 2.88427, 0.554303, 2.701478]])
vv

In [None]:
# Run the sample through the fitted transformer, then classify it.
# NOTE(review): the values in vv look like they may already be log-scaled — confirm
# they are raw measurements, otherwise the transform is applied twice.
vv_transformed = transformer.transform(vv)
r = et.predict(vv_transformed)

In [None]:
# display the predicted class returned by et.predict
r

In [None]:
# import pickle
# # save the model to disk
# with open('model_pkl', 'wb') as files:
#     pickle.dump(et, files)

In [2]:
# load saved model
# import pickle
# with open('model_pkl' , 'rb') as f:
#     lr = pickle.load(f)

In [3]:
# Predict the class for one hand-written sample of five feature values.
# NOTE(review): the pickle-loading cell above is commented out, so `lr` is whatever
# the kernel currently has bound to that name — confirm which model is intended and
# whether the sample should be transformed like the training features.
sample = [[1, 3, 5, 3, 2]]
lr.predict(sample)



array([1], dtype=int64)