In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the drug-classification CSV into a DataFrame and preview the first rows.
drug_csv_path = '/kaggle/input/drug-classification/drug200.csv'
data = pd.read_csv(drug_csv_path)
data.head()

In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [4]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

In [5]:
# Number of missing values in each column.
data.isna().sum()

In [6]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [7]:
# Summary statistics (count, mean, std, quartiles, ...) for the frame's columns.
data.describe()

In [8]:
# Build an automated profiling report for the frame and save it as an HTML file.
profile= pandas_profiling.ProfileReport(data)
profile.to_file('file.html')

# Bare last expression: the notebook shows the report object as this cell's output.
profile

In [9]:
# Grouped bar chart of Na_to_K per drug, with bars coloured by cholesterol level.
fig = px.bar(
    data_frame=data,
    x="Drug",
    y="Na_to_K",
    color='Cholesterol',
    barmode="group",
)
fig.show()

In [10]:
# Grouped bar chart of Na_to_K per drug, with bars coloured by blood-pressure level.
fig = px.bar(
    data_frame=data,
    x="Drug",
    y="Na_to_K",
    color='BP',
    barmode="group",
)
fig.show()

In [11]:
# Grouped bar chart of Age per drug, with bars coloured by blood-pressure level.
fig = px.bar(
    data_frame=data,
    x="Drug",
    y="Age",
    color='BP',
    barmode="group",
)
fig.show()

In [12]:
# Build a word cloud from every value of the Drug column, joined into one
# space-separated string, and draw it without axes.
drug_text = ' '.join(data['Drug'])
cloud_options = dict(background_color='black', max_font_size=50)
wordCloud = WordCloud(**cloud_options).generate(drug_text)

plt.figure(figsize=(15, 7))
plt.axis('off')
plt.imshow(wordCloud)
plt.show()

In [13]:
# Scatter plot of Na_to_K (x axis) against Age (y axis), one point per row.
fig = px.scatter(data_frame=data, x='Na_to_K', y='Age')
fig.show()

In [14]:
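# Disabled experiment, kept for reference: manual 0/1 encoding of Cholesterol.
# The active pipeline one-hot encodes the features with pd.get_dummies instead.
#mapp={'LOW':0,'HIGH':1}
#data['Cholesterol']=data['Cholesterol'].map(mapp)
#data['Cholesterol']=data['Cholesterol'].fillna(0)
#data['Cholesterol']=data['Cholesterol'].astype(int)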

In [15]:
# Bucket Age into decade bands and replace the raw column with the band label.
# Bins are left-closed ([20, 30) -> '20s'), so the edges read exactly like the
# labels, and the top bin is open-ended so ages above 80 land in '>60s' instead
# of silently becoming NaN. For integer ages from 1 to 80 the result is the
# same as with the previous right-closed edges (19, 29, ..., 80).
bin_age = [0, 20, 30, 40, 50, 60, 70, np.inf]
category_age = ['<20s', '20s', '30s', '40s', '50s', '60s', '>60s']
data['Age_binned'] = pd.cut(data['Age'], bins=bin_age, labels=category_age, right=False)
# NOTE: this cell consumes the 'Age' column, so it cannot be re-run without
# reloading `data` first.
data = data.drop(columns=['Age'])

In [16]:
# Bucket the Na_to_K ratio into the bands named by the labels and replace the
# raw column with the band label.
# Na_to_K is passed straight to pd.cut, so the edges must sit on 10/20/30: with
# the previous right-closed edges 9/19/29, a fractional value such as 9.5 was
# labelled '10-20' and 19.5 was labelled '20-30'. Bins are left-closed
# ([10, 20) -> '10-20') and the top bin is open-ended so no value becomes NaN.
bin_NatoK = [0, 10, 20, 30, np.inf]
category_NatoK = ['<10', '10-20', '20-30', '>30']
data['Na_to_K_binned'] = pd.cut(data['Na_to_K'], bins=bin_NatoK, labels=category_NatoK, right=False)
# NOTE: this cell consumes the 'Na_to_K' column, so it cannot be re-run without
# reloading `data` first.
data = data.drop(columns=['Na_to_K'])

In [17]:
# Preview the frame after the binning cells above replaced Age and Na_to_K.
data.head()

In [18]:
# Re-check column dtypes and non-null counts after binning.
data.info()

In [19]:
# Target is the Drug column; every remaining column is a feature.
y = data['Drug']
X = data.drop(columns=['Drug'])

In [20]:
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)

# Hold out 30% of the rows as the test set; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [21]:
# One-hot encode the categorical features.
X_train = pd.get_dummies(X_train)
# Encode the test split, then force it onto exactly the training columns (same
# set, same order). Encoding the two splits independently would give the test
# frame a different layout whenever a category value is absent from one split,
# and the fitted models would then reject it or misread its columns.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [22]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes of the training split only; the test split is
# left untouched. The seed is fixed so the synthetic samples, and therefore
# every score computed below, are the same on each run.
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

In [23]:
# Class counts of the training target after SMOTE resampling.
# y_train is passed directly as a vector; the original `data` frame is not
# supplied because it is unrelated to the resampled target.
sns.set_theme(style="darkgrid")
sns.countplot(y=y_train, palette="mako_r")
plt.ylabel('Drug Type')
plt.xlabel('Total')
plt.show()

In [24]:
from sklearn.linear_model import LogisticRegression ,SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB,GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report


In [25]:
from sklearn.metrics import accuracy_score

# Fit logistic regression on the resampled training set and evaluate on the test set.
log = LogisticRegression(solver='liblinear', max_iter=5000)
log.fit(X_train, y_train)

y_pred = log.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Arguments follow the documented (y_true, y_pred) order, matching the report
# and confusion-matrix calls above.
LRAcc = accuracy_score(y_test, y_pred)
print('Logistic Regression accuracy is: {:.2f}%'.format(LRAcc*100))

In [26]:
# Sweep n_neighbors over 1..29 and record the accuracy of each fitted model.
# NOTE(review): the scores are measured on the test split, so KNAccMax is the
# best test-set score across the sweep rather than an independent estimate.
k_values = range(1, 30)
scoreListknn = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    scoreListknn.append(knn.score(X_test, y_test))

plt.plot(k_values, scoreListknn)
plt.xticks(np.arange(1, 30, 1))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()

KNAccMax = max(scoreListknn)
print("KNN Acc Max {:.2f}%".format(KNAccMax*100))

In [27]:
from sklearn.metrics import accuracy_score

# Fit k-nearest neighbours with a fixed k of 20 and evaluate on the test set.
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Arguments follow the documented (y_true, y_pred) order, matching the report
# and confusion-matrix calls above.
KNAcc = accuracy_score(y_test, y_pred)
print('K Neighbours accuracy is: {:.2f}%'.format(KNAcc*100))

In [28]:
# Sweep max_leaf_nodes over 2..49 and record the accuracy of each fitted tree.
# random_state is fixed because tree construction is randomised; without it the
# curve and DTAccMax change from run to run.
# NOTE(review): the scores are measured on the test split, so DTAccMax is the
# best test-set score across the sweep rather than an independent estimate.
scoreListDT = []
for i in range(2,50):
    dc = DecisionTreeClassifier(max_leaf_nodes=i, random_state=0)
    dc.fit(X_train, y_train)
    scoreListDT.append(dc.score(X_test, y_test))

plt.plot(range(2,50), scoreListDT)
plt.xticks(np.arange(2,50,5))
plt.xlabel("Leaf")
plt.ylabel("Score")
plt.show()
DTAccMax = max(scoreListDT)
print("DT Acc Max {:.2f}%".format(DTAccMax*100))

In [29]:
from sklearn.metrics import accuracy_score

# Fit a decision tree capped at 20 leaves and evaluate on the test set.
# random_state is fixed so the fitted tree, and the reported accuracy, are
# reproducible across runs.
dc = DecisionTreeClassifier(max_leaf_nodes=20, random_state=0)
dc.fit(X_train, y_train)

y_pred = dc.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Arguments follow the documented (y_true, y_pred) order, matching the report
# and confusion-matrix calls above.
DTAcc = accuracy_score(y_test, y_pred)
print('Decision Tree accuracy is: {:.2f}%'.format(DTAcc*100))

In [30]:
# Sweep max_leaf_nodes over 2..49 for a 1000-tree forest (seed fixed at 1) and
# record the accuracy of each fitted model.
# NOTE(review): the scores are measured on the test split, so RFAccMax is the
# best test-set score across the sweep rather than an independent estimate.
leaf_limits = range(2, 50)
scoreListRF = []
for leaf_limit in leaf_limits:
    rf = RandomForestClassifier(
        n_estimators=1000,
        random_state=1,
        max_leaf_nodes=leaf_limit,
    ).fit(X_train, y_train)
    scoreListRF.append(rf.score(X_test, y_test))

plt.plot(leaf_limits, scoreListRF)
plt.xticks(np.arange(2, 50, 5))
plt.xlabel("RF Value")
plt.ylabel("Score")
plt.show()

RFAccMax = max(scoreListRF)
print("RF Acc Max {:.2f}%".format(RFAccMax*100))

In [31]:
from sklearn.metrics import accuracy_score

# Fit a random forest capped at 30 leaves per tree and evaluate on the test set.
# random_state is fixed (same seed as the sweep cell) so the forest, and the
# reported accuracy, are reproducible across runs.
rf = RandomForestClassifier(max_leaf_nodes=30, random_state=1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Arguments follow the documented (y_true, y_pred) order, matching the report
# and confusion-matrix calls above.
RFAcc = accuracy_score(y_test, y_pred)
print('Random Forest accuracy is: {:.2f}%'.format(RFAcc*100))

In [32]:
from sklearn.metrics import accuracy_score

# Fit a linear-kernel SVM and evaluate on the test set.
# NOTE(review): max_iter=251 caps the solver's iterations; the reason for this
# exact value is not evident here. Warnings are silenced at the top of the
# notebook, so a non-converged fit would pass unnoticed -- confirm convergence.
svc = SVC(kernel='linear', max_iter=251)
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Arguments follow the documented (y_true, y_pred) order, matching the report
# and confusion-matrix calls above.
SVCAcc = accuracy_score(y_test, y_pred)
print('SVC accuracy is: {:.2f}%'.format(SVCAcc*100))

In [33]:
from sklearn.metrics import accuracy_score

# Fit Gaussian naive Bayes on the one-hot encoded features and evaluate on the test set.
nbg = GaussianNB()
nbg.fit(X_train, y_train)

y_pred = nbg.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Arguments follow the documented (y_true, y_pred) order, matching the report
# and confusion-matrix calls above.
NBAcc2 = accuracy_score(y_test, y_pred)
print('Gaussian Naive Bayes accuracy is: {:.2f}%'.format(NBAcc2*100))

In [34]:
from sklearn.metrics import accuracy_score

# Fit categorical naive Bayes on the one-hot encoded features and evaluate on the test set.
nbc = CategoricalNB()
nbc.fit(X_train, y_train)

y_pred = nbc.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Arguments follow the documented (y_true, y_pred) order, matching the report
# and confusion-matrix calls above.
NBAcc1 = accuracy_score(y_test, y_pred)
print('Naive Bayes accuracy is: {:.2f}%'.format(NBAcc1*100))

In [35]:
# Collect every accuracy computed above (as fractions), convert to percentages
# and show the models ranked from best to worst. The "... Max" rows are the best
# scores from the corresponding hyper-parameter sweep cells.
model_accuracy = {
    'Logistic Regression': LRAcc,
    'K Neighbors': KNAcc,
    'K Neighbors Max': KNAccMax,
    'SVM': SVCAcc,
    'Categorical NB': NBAcc1,
    'Gaussian NB': NBAcc2,
    'Decision Tree': DTAcc,
    'Decision Tree Max': DTAccMax,
    'Random Forest': RFAcc,
    'Random Forest Max': RFAccMax,
}
compare = pd.DataFrame({
    'Model': list(model_accuracy),
    'Accuracy': [fraction*100 for fraction in model_accuracy.values()],
})
compare.sort_values(by='Accuracy', ascending=False)