In [46]:
import numpy as np
import pandas as pd

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns

from sklearn.metrics import r2_score

## Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset.

In [48]:
df=pd.read_csv("adult.csv")
df

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32556,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32557,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32558,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [49]:
df.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [50]:
col_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship',
             'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']

df.columns = col_names

df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [123]:
df.drop('workclass', axis=1, inplace=True)

In [124]:
df.head()

Unnamed: 0,age,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
1,38,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
5,49,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K


In [125]:
df.isna().sum()

age               0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32559 entries, 1 to 32559
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32559 non-null  int64 
 1   fnlwgt          32559 non-null  int64 
 2   education       32559 non-null  object
 3   education_num   32559 non-null  int64 
 4   marital_status  32559 non-null  object
 5   occupation      32559 non-null  object
 6   relationship    32559 non-null  object
 7   race            32559 non-null  object
 8   sex             32559 non-null  object
 9   capital_gain    32559 non-null  int64 
 10  capital_loss    32559 non-null  int64 
 11  hours_per_week  32559 non-null  int64 
 12  native_country  32559 non-null  object
 13  income          32559 non-null  object
dtypes: int64(6), object(8)
memory usage: 4.7+ MB


In [127]:
df.isnull().sum()

age               0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [128]:
df.isnull().sum()

age               0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [130]:
# check labels in workclass variable


In [131]:
df.occupation.value_counts()

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4065
 Adm-clerical         3769
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64

In [132]:
# check labels in native_country variable

df.native_country.unique()


array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [133]:
# check frequency distribution of values in native_country variable

df.native_country.value_counts()
#583 unknown

 United-States                 29168
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [134]:
df=df[~(df['native_country']=="?")]
df['native_country'].value_counts()


 United-States                 29168
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [135]:
df['occupation']=df['occupation'].replace("?",np.NaN)
df['occupation'].value_counts()

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4065
 Adm-clerical         3769
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64

In [137]:
X = df.drop(['income'], axis=1)

y = df['income']

In [138]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [139]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((22791, 13), (9768, 13), (22791,), (9768,))

In [140]:
X_train.dtypes

age                int64
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
dtype: object

In [141]:
X_train.isnull().mean()

age               0.0
fnlwgt            0.0
education         0.0
education_num     0.0
marital_status    0.0
occupation        0.0
relationship      0.0
race              0.0
sex               0.0
capital_gain      0.0
capital_loss      0.0
hours_per_week    0.0
native_country    0.0
dtype: float64

In [143]:
X_test.shape, X_train.shape

((9768, 13), (22791, 13))

In [144]:
si= SimpleImputer(strategy='most_frequent')
X_train[['occupation','native_country']]=si.fit_transform(X_train[['occupation','native_country']])
X_test[['occupation','native_country']]=si.fit_transform(X_test[['occupation','native_country']])
X_test[['occupation','native_country']].shape

(9768, 2)

In [145]:
ohe=OneHotEncoder(sparse=False)
ohe.fit([['education', 'marital_status', 'occupation', 'relationship', 
                                 'race', 'sex', 'native_country']])



cat = [1,3,5,6,7,8,9,13]
trf2  = ColumnTransformer([
    ("onehot_categorical", OneHotEncoder(sparse = False,handle_unknown='ignore'), cat)],
    remainder= "passthrough")

In [146]:
X_train_transformed=ohe.fit_transform(X_train[['education', 'marital_status', 'occupation', 'relationship', 
                                 'race', 'sex', 'native_country']])
X_test_transformed=ohe.fit_transform(X_test[['education', 'marital_status', 'occupation', 'relationship', 
                                 'race', 'sex', 'native_country']])
X_test_transformed.shape



(9768, 93)

In [147]:
X_train_transformed.shape

(22791, 92)

In [148]:
X_train_numeric=X_train.drop(columns=['education', 'marital_status', 'occupation', 'relationship', 
                                 'race', 'sex', 'native_country']).values
X_test_numeric=X_test.drop(columns=['education', 'marital_status', 'occupation', 'relationship', 
                                 'race', 'sex', 'native_country']).values
X_test_numeric.shape

(9768, 6)

In [149]:
X_train_numeric.shape

(22791, 6)

In [150]:
X_train_transformed

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [151]:
df.columns

Index(['age', 'fnlwgt', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'income'],
      dtype='object')

In [152]:
X_train_transform=np.concatenate((X_train_transformed,X_train_numeric),axis=1)
X_test_transform=np.concatenate((X_test_transformed,X_test_numeric),axis=1)
X_train_transform.shape

(22791, 98)

In [153]:
X_test_transform.shape

(9768, 99)

In [154]:
X_train_transform[1]

array([0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.000

In [155]:
#from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_transform = sc.fit_transform(X_train_transform)
X_test_transform= sc.fit_transform(X_test_transform)

In [156]:
X_test_transform

array([[-0.17398583, -0.18874165, -0.11970889, ..., -0.14377092,
        -0.21494696, -0.03624826],
       [-0.17398583, -0.18874165, -0.11970889, ..., -0.14377092,
        -0.21494696, -0.03624826],
       [-0.17398583, -0.18874165, -0.11970889, ..., -0.14377092,
        -0.21494696,  0.76609258],
       ...,
       [-0.17398583, -0.18874165, -0.11970889, ...,  1.9645913 ,
        -0.21494696,  0.36492216],
       [-0.17398583, -0.18874165, -0.11970889, ..., -0.14377092,
        -0.21494696, -0.03624826],
       [-0.17398583, -0.18874165, -0.11970889, ..., -0.14377092,
        -0.21494696, -0.03624826]])

In [157]:
from sklearn.naive_bayes import GaussianNB


# instantiate the model
gnb = GaussianNB()


# fit the model
gnb.fit(X_train_transform, y_train)

In [None]:
y_pred = gnb.predict(X_test_transform)

y_pred

In [None]:
f=len(y_pred)
f

In [160]:
y_test

31037     <=50K
8951       >50K
7839       >50K
31511      >50K
19140     <=50K
          ...  
32327     <=50K
11469      >50K
26404      >50K
15301     <=50K
26822     <=50K
Name: income, Length: 9768, dtype: object

In [161]:
pd.DataFrame({'y_test':y_test, 'y_pred':y_pred})

NameError: name 'y_pred' is not defined

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) 

In [None]:
y_pred_train = gnb.predict(X_train)

y_pred_train

In [None]:
y_test.value_counts()

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

In [None]:
y_pred_prob = gnb.predict_proba(X_test)[0:10]

y_pred_prob

In [None]:
data = pd.read_csv("Ecommerce Customers")

# covid 

In [None]:
df = pd.read_csv("covid_toy.csv")

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df.drop(columns=["has_covid"]),df["has_covid"],test_size=0.2)

In [None]:
x_train

In [None]:
df.city.value_counts()


In [None]:
si= SimpleImputer()
x_train_fever=si.fit_transform(x_train[["fever"]])
x_test_fever=si.fit_transform(x_test[["fever"]])
x_train_fever.shape

In [None]:
from sklearn.preprocessing import OrdinalEncoder
oe=OrdinalEncoder(categories=[['Mild','Strong']])
x_train_cough=oe.fit_transform(x_train[['cough']])
x_test_cough=oe.fit_transform(x_test[['cough']])
x_train_cough.shape




In [None]:
ohe=OneHotEncoder(sparse=False)
x_train_gender_city=ohe.fit_transform(x_train[['gender','city']])
x_test_gender_city=ohe.fit_transform(x_test[['gender','city']])
x_train_gender_city.shape

In [None]:
x_train_gender_city

In [None]:
x_train_age=x_train.drop(columns=["gender","fever","cough","city"]).values
x_test_age=x_test.drop(columns=["gender","fever","cough","city"]).values
x_train_age.shape

In [None]:
x_train_transformed=np.concatenate((x_train_age,x_train_fever,x_train_gender_city,x_train_cough),axis=1)
x_test_transformed=np.concatenate((x_test_age,x_test_fever,x_test_gender_city,x_test_cough),axis=1)
x_train_transformed.shape

In [None]:
from sklearn.naive_bayes import GaussianNB


# instantiate the model
gnb = GaussianNB()


# fit the model
gnb.fit(x_train_transformed, y_train)

In [None]:
y_pred = gnb.predict(x_test_transformed)

y_pred

In [None]:
pd.DataFrame({'y_test':y_test, 'y_pred':y_pred})

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) 

In [None]:
y_pred_train = gnb.predict(x_train_transformed)

y_pred_train

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_pred_train) 