In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import pickle
import warnings
warnings.filterwarnings("ignore")

In [60]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
  
# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables) 


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [61]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


In [62]:
X['education'] = X['education'].replace(['HS-grad','10th','9th'], "High-Scl")
X['education'] = X['education'].replace(['5th-6th','7th-8th'], "Middel-Scl")
X['education'] = X['education'].replace(['1st-4th', 'Preschool'], "Primary-Scl")
X['education'] = X['education'].replace(['11th','12th'], "Higer_sec-Scl")
X['education'] = X['education'].replace(['Some-college','Prof-school', 'Bachelors'], "College")
X['education'] = X['education'].replace(['Assoc-acdm', 'Assoc-voc'],"Other")

In [63]:
X['education'].unique()

array(['College', 'High-Scl', 'Higer_sec-Scl', 'Masters', 'Other',
       'Middel-Scl', 'Doctorate', 'Primary-Scl'], dtype=object)

In [64]:
X["occupation"].unique()

array(['Adm-clerical', 'Exec-managerial', 'Handlers-cleaners',
       'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', '?', 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv', nan], dtype=object)

In [65]:
X.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
dtype: int64

In [67]:
X.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
dtype: int64

In [68]:
X['occupation']=X['occupation'].replace({'?':np.nan})

In [69]:
X["relationship"].unique()

array(['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried',
       'Other-relative'], dtype=object)

In [70]:
X.isnull().sum()

age                  0
workclass          963
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     274
dtype: int64

In [71]:
X['workclass'] = X['workclass'].replace({'State-gov': 'Govt', 'Federal-gov': 'Govt', 'Local-gov': 'Govt','?':np.nan,'Self-emp-not-inc':'SelfEmployee','Self-emp-inc':'SelfEmployee','Never-worked':'Without-pay'})


In [72]:
X['workclass'].unique()

array(['Govt', 'SelfEmployee', 'Private', nan, 'Without-pay'],
      dtype=object)

In [73]:

X['marital-status']=X['marital-status'].replace({'Never-married': 'Single',
                                                  'Married-spouse-absent': 'Single',
                                                    'Divorced': 'Single',
                                                    'Separated':'Single',
                                                    'Widowed':'Single',
                                                    'Married-civ-spouse':'Couple','Married-AF-spouse':'Couple'})


In [74]:
X['marital-status'].unique()

array(['Single', 'Couple'], dtype=object)

In [75]:

X['native-country']=X['native-country'].replace({'United-States':'US','Cuba':"Non-US", 'Jamaica':"Non-US", 'India':"Non-US", 'Mexico':"Non-US",
       'South':"Non-US", 'Puerto-Rico':"Non-US", 'Honduras':"Non-US", 'England':"Non-US", 'Canada':"Non-US", 'Germany':"Non-US",
       'Iran':"Non-US", 'Philippines':"Non-US", 'Italy':"Non-US", 'Poland':"Non-US", 'Columbia':"Non-US", 'Cambodia':"Non-US",
       'Thailand':"Non-US", 'Ecuador':"Non-US", 'Laos':"Non-US", 'Taiwan':"Non-US", 'Haiti':"Non-US", 'Portugal':"Non-US",
       'Dominican-Republic':"Non-US", 'El-Salvador':"Non-US", 'France':"Non-US", 'Guatemala':"Non-US",
       'China':"Non-US", 'Japan':"Non-US", 'Yugoslavia':"Non-US", 'Peru':"Non-US",
       'Outlying-US(Guam-USVI-etc)':"Non-US", 'Scotland':"Non-US", 'Trinadad&Tobago':"Non-US",
       'Greece':"Non-US", 'Nicaragua':"Non-US", 'Vietnam':"Non-US", 'Hong':"Non-US", 'Ireland':"Non-US", 'Hungary':"Non-US",
       'Holand-Netherlands':"Non-US",'?':np.nan})

In [76]:
X['native-country'].unique()


array(['US', 'Non-US', nan], dtype=object)

In [82]:
for col in X.columns:
    if X[col].dtype == 'object':  
        X[col] = X[col].replace("?", np.nan)   
        X[col] = X[col].fillna(X[col].mode()[0])  
    else:
        X[col] = X[col].fillna(X[col].mode()[0])

In [None]:
X.isnull().sum()

In [77]:
y

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
48837,<=50K.
48838,<=50K.
48839,<=50K.
48840,<=50K.


In [78]:
from sklearn.model_selection import train_test_split,KFold,cross_val_score
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [79]:
from sklearn.linear_model import LogisticRegression
log_model=LogisticRegression()

In [80]:
kfold=KFold(n_splits=5)

In [81]:
score=cross_val_score(log_model,X_train,y_train)


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1247, in fit
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
        ^^^^^
    ...<5 lines>...
        accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\utils\validation.py", line 2971, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\utils\validation.py", line 1368, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\utils\validation.py", line 1053, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\utils\_array_api.py", line 757, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'SelfEmployee'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1247, in fit
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
        ^^^^^
    ...<5 lines>...
        accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\utils\validation.py", line 2971, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\utils\validation.py", line 1368, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\utils\validation.py", line 1053, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\sklearn\utils\_array_api.py", line 757, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "d:\btch 16\machinlerning_group\vnp\Lib\site-packages\pandas\core\generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'Private'


In [None]:
kfold.mean()