In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("AIDS_Classification.csv")

In [3]:
df.shape

(15000, 23)

In [4]:
df.columns

Index(['time', 'trt', 'age', 'wtkg', 'hemo', 'homo', 'drugs', 'karnof',
       'oprior', 'z30', 'preanti', 'race', 'gender', 'str2', 'strat',
       'symptom', 'treat', 'offtrt', 'cd40', 'cd420', 'cd80', 'cd820',
       'infected'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820,infected
0,1108,1,37,88.11364,0,1,1,100,0,1,...,1,1,0,0,0,389,320,734,737,1
1,1079,0,43,66.77075,0,0,0,100,0,1,...,0,2,0,1,1,318,432,912,1213,0
2,492,1,34,82.91725,0,0,0,90,0,1,...,1,2,0,1,1,326,524,660,835,0
3,1191,1,41,98.91817,0,0,0,81,0,1,...,1,3,0,1,0,318,232,1131,982,1
4,1141,3,47,53.61717,0,1,0,100,0,0,...,0,1,0,1,0,280,337,515,679,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   time      15000 non-null  int64  
 1   trt       15000 non-null  int64  
 2   age       15000 non-null  int64  
 3   wtkg      15000 non-null  float64
 4   hemo      15000 non-null  int64  
 5   homo      15000 non-null  int64  
 6   drugs     15000 non-null  int64  
 7   karnof    15000 non-null  int64  
 8   oprior    15000 non-null  int64  
 9   z30       15000 non-null  int64  
 10  preanti   15000 non-null  int64  
 11  race      15000 non-null  int64  
 12  gender    15000 non-null  int64  
 13  str2      15000 non-null  int64  
 14  strat     15000 non-null  int64  
 15  symptom   15000 non-null  int64  
 16  treat     15000 non-null  int64  
 17  offtrt    15000 non-null  int64  
 18  cd40      15000 non-null  int64  
 19  cd420     15000 non-null  int64  
 20  cd80      15000 non-null  in

In [7]:
df.isnull().sum()

time        0
trt         0
age         0
wtkg        0
hemo        0
homo        0
drugs       0
karnof      0
oprior      0
z30         0
preanti     0
race        0
gender      0
str2        0
strat       0
symptom     0
treat       0
offtrt      0
cd40        0
cd420       0
cd80        0
cd820       0
infected    0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [9]:
df.drop('time',axis=1,inplace=True)

In [10]:
df['infected'].value_counts()

infected
0    10369
1     4631
Name: count, dtype: int64

In [11]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [12]:
X

Unnamed: 0,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,preanti,...,gender,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820
0,1,37,88.11364,0,1,1,100,0,1,169,...,0,1,1,0,0,0,389,320,734,737
1,0,43,66.77075,0,0,0,100,0,1,654,...,1,0,2,0,1,1,318,432,912,1213
2,1,34,82.91725,0,0,0,90,0,1,710,...,1,1,2,0,1,1,326,524,660,835
3,1,41,98.91817,0,0,0,81,0,1,992,...,1,1,3,0,1,0,318,232,1131,982
4,3,47,53.61717,0,1,0,100,0,0,0,...,1,0,1,0,1,0,280,337,515,679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,3,36,59.63057,0,1,0,100,0,0,2,...,0,0,1,0,1,1,263,358,2091,608
14996,0,27,102.69575,0,0,0,90,0,0,0,...,1,0,1,0,1,0,327,582,839,392
14997,3,40,79.07121,0,1,1,90,0,1,257,...,1,1,3,0,1,0,283,211,2325,1000
14998,2,40,80.51779,0,0,0,90,0,0,278,...,1,1,3,1,1,0,237,406,560,860


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [15]:
from xgboost import XGBClassifier

In [16]:
model = XGBClassifier()

In [17]:
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [20]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6856666666666666
[[1775  299]
 [ 644  282]]
              precision    recall  f1-score   support

           0       0.73      0.86      0.79      2074
           1       0.49      0.30      0.37       926

    accuracy                           0.69      3000
   macro avg       0.61      0.58      0.58      3000
weighted avg       0.66      0.69      0.66      3000



In [21]:
param_grid = {
    'n_estimators': [100, 200, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
}

In [22]:
from sklearn.model_selection import RandomizedSearchCV

In [23]:
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=5, n_jobs=-1)

In [24]:
grid.fit(X_train, y_train)

In [25]:
grid.best_params_

{'n_estimators': 400, 'learning_rate': 0.01}

In [26]:
grid.best_score_

0.7025833333333333

In [27]:
y_pred_grid = grid.predict(X_test)

In [28]:
print(accuracy_score(y_test, y_pred_grid))

0.7076666666666667
