# In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so prefer the new name when this
# installation provides it and fall back to the legacy name otherwise.
_darkgrid_style = 'seaborn-v0_8-darkgrid'
plt.style.use(_darkgrid_style if _darkgrid_style in plt.style.available else 'seaborn-darkgrid')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,precision_score, f1_score, confusion_matrix
# Load the diabetes health-indicator survey extract (BRFSS 2015, per the file name).
df = pd.read_csv('D:/ai project/diabetes_binary_health_indicators_BRFSS2015.csv')
print(f"Data has {df.shape[0]} Rows and {df.shape[1]} Features")
df.head()

# Structural overview: column dtypes, summary statistics and missing values.
df.info()

df.describe()

df.isna().sum()

# Count the fully duplicated rows and show which class they belong to.
duplicate = df[df.duplicated()]
print("Duplicate Rows : ", len(duplicate))
duplicate['Diabetes_binary'].value_counts()

# Drop repeated rows of the negative class only, because the data is
# unbalanced and class 0 is the one being trimmed.
# NOTE: the previous `df.query(...).drop_duplicates(inplace=True)` only
# de-duplicated the temporary frame returned by query(); `df` itself was never
# modified. A duplicated row necessarily shares its Diabetes_binary value with
# its first occurrence, so this mask selects exactly the rows that
# de-duplicating the class-0 subset would remove.
repeated_negative = df.duplicated() & (df['Diabetes_binary'] == 0)
df = df[~repeated_negative]

# Cast every column to int (author's rationale: integer arithmetic is faster).
# Assumes all stored values are integral -- TODO confirm against the CSV.
df = df.astype('int')
df['Diabetes_binary'].value_counts(normalize=True) * 100

# Bar chart of how many rows fall in each target class.
sns.countplot(data=df, x='Diabetes_binary')

# Heatmap of the pairwise column correlations, colour scale capped at 0.9.
corr_matrix = df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, square=True, vmax=0.9)
plt.title("Correlations")
plt.show()


# Feature selection: keep every column whose absolute correlation with the
# target is at least 0.05. The target column is kept as well, as long as its
# self-correlation is defined.
correlation = df.corr()
target_corr = correlation['Diabetes_binary']

# Label-based boolean selection replaces the former positional `series[i]`
# lookup (deprecated by pandas for label-indexed Series) and the manual
# insert counter. Column order follows df.columns, exactly as before.
selected_columns = target_corr[abs(target_corr) >= 0.05].index
updatedData = df[selected_columns].copy()
# # Class count
# count_class_0, count_class_1 = updatedData['Diabetes_binary'].value_counts()

# # Divide by class
# df_class_0 = updatedData[updatedData['Diabetes_binary'] == 0]
# df_class_1 = updatedData[updatedData['Diabetes_binary'] == 1]
# # Undersample 0-class and concat the DataFrames of both class
# df_class_0_under = df_class_0.sample(count_class_1)
# updatedData = pd.concat([df_class_0_under, df_class_1], axis=0)

# print('Random under-sampling:')
# print(updatedData['Diabetes_binary'].value_counts())



# # Oversample 1-class and concat the DataFrames of both classes
# df_class_1_over = df_class_1.sample(count_class_0, replace=True)
# df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

# print('Random over-sampling:')
# print(updatedData['Diabetes_binary'].value_counts())
# Separate the predictors from the target column.
X = updatedData.drop(['Diabetes_binary'], axis=1)
y = updatedData['Diabetes_binary']


# Splitting data into training and test set.
# The split is done BEFORE any resampling: oversampling the full data set first
# lets synthetic points interpolated from future test rows leak into training
# and fills the test set with synthetic samples, which inflates the scores.
# `stratify=y` keeps the original class ratio in both portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes of the training portion only; the test set keeps its
# real-world class distribution. The seed makes the synthetic samples
# reproducible.
from imblearn.over_sampling import SMOTE
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

y_train.value_counts()

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


# Creating StandardScaler instance
sc = StandardScaler()
# Fit the scaler on the training data only, then apply the same
# transformation to the test data.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


def _report_split(title, y_true, y_pred):
    """Print the metrics and plot the confusion matrix for one data split.

    Args:
        title: Banner line printed before the metrics.
        y_true: Ground-truth labels of the split.
        y_pred: Labels predicted by the classifier for the same rows.
    """
    print(title)
    AccScore = accuracy_score(y_true, y_pred)
    F1Score = f1_score(y_true, y_pred, average='weighted')
    PrecisionScore = precision_score(y_true, y_pred, average='weighted')
    print("Accuracy Score: ", AccScore * 100)
    print('Precision Score is : ', PrecisionScore * 100)
    print("F1-Score: ", F1Score * 100)

    CM = confusion_matrix(y_true, y_pred)
    print('Confusion Matrix is : \n', CM)
    # drawing confusion matrix
    plt.figure(figsize=(8, 6))
    # annot=True writes the counts into the cells; `fmt` has no effect without
    # it. The former `center=True` (presumably a typo for `annot=True`) passed
    # a boolean where seaborn expects a numeric colormap centre.
    sns.heatmap(CM, annot=True, fmt=".0f", cmap='viridis')
    plt.show()


def main(classifier):
    """Evaluate an already fitted classifier on the training and test splits.

    Reads the module-level X_train, y_train, X_test and y_test. For each split
    it prints accuracy, weighted precision and weighted F1 (as percentages),
    prints the confusion matrix and shows it as a heatmap.

    Args:
        classifier: A fitted estimator exposing ``predict``.
    """
    y_train_pred = classifier.predict(X_train)
    y_test_pred = classifier.predict(X_test)

    _report_split("\n-----------TRAINING DATA-----------", y_train, y_train_pred)
    _report_split("\n-----------TESTING DATA-----------", y_test, y_test_pred)
# Logistic regression: build the model, fit it on the training split
# (fit() returns the estimator itself) and report its metrics.
classifier = LogisticRegression(random_state=0, C=100).fit(X_train, y_train)
main(classifier)


# Decision tree with entropy as the split criterion.
# scikit-learn permutes the candidate features at every split, so without a
# fixed seed the fitted tree (and the feature importances read from it below)
# can differ between runs. Seeded like the split and the logistic regression.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier = classifier.fit(X_train, y_train)
main(classifier)
print('\n-----------------------------------------------\n')

def fun(row):
    """Scale ``row`` by 100 and round the result to two decimals."""
    as_percent = row * 100
    return np.round(as_percent, 2)
# Importance of each input feature in the fitted classifier, turned into a
# rounded percentage by fun().
raw_importances = classifier.feature_importances_
features = fun(raw_importances)
def Sort_Tuple(tup):
    """Return the items of ``tup`` as a list ordered by their second element.

    The order is descending; items with equal second elements keep their
    original relative order.
    """
    def _second(item):
        return item[1]

    ordered = list(tup)
    ordered.sort(key=_second, reverse=True)
    return ordered
# Pair every feature name with its importance, rank the pairs from most to
# least important and show them as a two-column table.
importance_rows = Sort_Tuple(list(zip(X.columns, features)))
pd.DataFrame(importance_rows, columns=['Feature', 'Importance (%)'])

from sklearn.svm import LinearSVC
# Linear SVM trained with the hinge loss.
# The solver used for the hinge loss shuffles the data pseudo-randomly, so the
# seed is fixed to make the reported metrics reproducible, consistent with the
# seeded split and logistic regression in this script.
classifier = LinearSVC(C=100, loss="hinge", random_state=0)
classifier.fit(X_train, y_train)
main(classifier)







