In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor


import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / future warnings raised by the
# libraries used below; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')


In [None]:
import xgboost as xgb

In [None]:
# Load the car-listing CSV into `df`.
# NOTE(review): the absolute `/content/` prefix looks like a Google Colab upload
# location -- confirm, and adjust the path when running elsewhere.
df = pd.read_csv('/content/CAR DETAILS FROM CAR DEKHO.csv')

In [None]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [None]:
df.shape

(4340, 8)

In [None]:
df.isna().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [None]:
df.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [None]:
# Keep only the first occurrence of each fully identical row. The original row
# labels are retained (the index is not reset).
df = df.drop_duplicates()

In [None]:
df.shape

(3577, 8)

In [None]:
# Dtype names treated as numeric; a column whose dtype is not in this list is
# considered categorical.
numerics = ['int8','int16','int32','int64','float16','float32','float64']

# All column names of the frame, in their original order.
features = df.columns.values.tolist()

# Non-numeric columns, in the same order as they appear in `df`.
categorical_columns = [name for name in features if df[name].dtype not in numerics]

In [None]:
categorical_columns

['name', 'fuel', 'seller_type', 'transmission', 'owner']

### Encoding the categorical columns with `pd.get_dummies` (one-hot encoding)

In [None]:
# One-hot encode every categorical column, dropping the first level of each so
# that no indicator is redundant.
# `dtype` is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (pandas 2.0 changed the default indicator dtype to bool).
# NOTE(review): `name` is among the encoded columns, so each distinct car name
# becomes its own indicator column; consider a coarser feature (e.g. the brand)
# if the resulting width hurts the model.
df_dummies = pd.get_dummies(df[categorical_columns], drop_first=True, dtype='uint8')

In [None]:
df_dummies

Unnamed: 0,name_Ambassador Classic 2000 Dsz,name_Ambassador Grand 1800 ISZ MPFI PW CL,name_Audi A4 1.8 TFSI,name_Audi A4 2.0 TDI,name_Audi A4 2.0 TDI 177 Bhp Premium Plus,name_Audi A4 3.0 TDI Quattro,name_Audi A4 30 TFSI Technology,name_Audi A4 35 TDI Premium,name_Audi A4 35 TDI Premium Plus,name_Audi A4 New 2.0 TDI Multitronic,...,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
4336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
4337,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,1,0,0
4338,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [None]:
# Attach the indicator columns to the original frame, aligned on the row index.
# Re-running this cell after it has already succeeded fails because the
# indicator column names would then overlap.
df = df.join(df_dummies, how='left')

In [None]:
df.shape

(3577, 1509)

In [None]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,name_Ambassador Classic 2000 Dsz,name_Ambassador Grand 1800 ISZ MPFI PW CL,...,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner,0,0,...,0,0,1,1,0,1,0,0,0,0
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner,0,0,...,0,0,1,1,0,1,0,0,0,0
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner,0,0,...,0,0,0,1,0,1,0,0,0,0
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner,0,0,...,0,0,1,1,0,1,0,0,0,0
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner,0,0,...,0,0,0,1,0,1,0,1,0,0


In [None]:
# Remove the original text columns now that their indicator columns are present.
df = df.drop(columns=categorical_columns)

In [None]:
df.shape

(3577, 1504)

In [None]:
df.head()

Unnamed: 0,year,selling_price,km_driven,name_Ambassador Classic 2000 Dsz,name_Ambassador Grand 1800 ISZ MPFI PW CL,name_Audi A4 1.8 TFSI,name_Audi A4 2.0 TDI,name_Audi A4 2.0 TDI 177 Bhp Premium Plus,name_Audi A4 3.0 TDI Quattro,name_Audi A4 30 TFSI Technology,...,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,60000,70000,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,0
1,2007,135000,50000,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,0
2,2012,600000,100000,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,2017,250000,46000,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,0
4,2014,450000,141000,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0


In [None]:
df.shape

(3577, 1504)

In [None]:
# Target is the selling price; every remaining column is a model input.
y = df['selling_price']
x = df.drop(columns=['selling_price'])

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=10,
)

**XGBoost regression (XGB)**

In [None]:
import xgboost as xgb
# Regressor created with the library's default hyperparameters (none are overridden).
x_gb = xgb.XGBRegressor()

In [None]:
# Fit the regressor on the training features and training target.
x_gb.fit(x_train,y_train)



XGBRegressor()

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Predictions of the fitted regressor for the held-out test features.
y_pred = x_gb.predict(x_test)

In [None]:
# Coefficient of determination (R^2) of the predictions on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.46199126492016385

### Oversampling with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
from collections import Counter

In [None]:
# Create a SMOTE over-sampler *instance*. The previous code bound the SMOTE class
# itself (missing call parentheses), so calling `sm.fit_resample(X, y)` would fail.
# The seed makes the synthetic samples repeatable between runs.
sm = SMOTE(random_state=10)

In [None]:
# Load the second CSV (heart-disease data, going by the file name) into `data`.
# NOTE(review): the absolute `/content/` prefix looks like a Google Colab upload
# location -- confirm, and adjust the path when running elsewhere.
data = pd.read_csv('/content/heart_dieseas.csv')

In [None]:
data.shape

(303, 14)

In [None]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [None]:
data['sex'].value_counts()

1    207
0     96
Name: sex, dtype: int64

In [None]:
# SMOTE over-sampler that grows the minority class until
# minority / majority == 0.75 after resampling.
# `sampling_strategy` is passed by keyword: current imbalanced-learn releases make
# the constructor arguments keyword-only, so the positional form `SMOTE(0.75)`
# raises a TypeError there. The seed makes the synthetic samples repeatable.
# NOTE(review): for `target` the classes are 138 vs 165 (ratio ~0.84, already
# above 0.75), for which a 0.75 ratio cannot be reached by over-sampling;
# presumably this sampler is meant for a more imbalanced label such as `sex`
# (96 vs 207) -- confirm.
over_samp = SMOTE(sampling_strategy=0.75, random_state=10)