In [20]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Sample DataFrame with missing values
data = {
    'manufacturer': ['Ford', 'Toyota', 'Toyota','Honda', 'Ford', np.nan],
    'model': ['Focus', 'Corolla','Camry','Civic', 'sierra 1500 crew cab slt', 'Camry'],
    'year': [2010, 2011,np.nan,2012, None, 2014],
    'price': [10000, 15000,23450, 20000, 12000, np.nan],
    'odometer': [50000, 60000,12355,70000, 80000, 12355]
}

df2 = pd.DataFrame(data)

# Load the dataset
data = pd.read_csv('data/vehicles.csv')

df = data.head(200)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            200 non-null    int64  
 1   region        200 non-null    object 
 2   price         200 non-null    int64  
 3   year          172 non-null    float64
 4   manufacturer  166 non-null    object 
 5   model         171 non-null    object 
 6   condition     166 non-null    object 
 7   cylinders     105 non-null    object 
 8   fuel          173 non-null    object 
 9   odometer      173 non-null    float64
 10  title_status  173 non-null    object 
 11  transmission  173 non-null    object 
 12  VIN           129 non-null    object 
 13  drive         98 non-null     object 
 14  size          21 non-null     object 
 15  type          163 non-null    object 
 16  paint_color   141 non-null    object 
 17  state         200 non-null    object 
dtypes: float64(2), int64(2), objec

In [23]:
# Display the original DataFrame with missing values
print("Original DataFrame with missing values:")
print(df)

# Identify object columns and convert them to categorical type
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    df[col] = df[col].astype('category')

print("\nDataFrame with object columns converted to categorical type:")
print(df)

# Store the original categories
original_categories = {}
for col in object_cols:
    original_categories[col] = df[col].cat.categories

# Encode categorical columns
for col in object_cols:
    df[col] = df[col].cat.codes.replace(-1, np.nan)  # Replace -1 (pandas code for NaN) with np.nan

print("\nDataFrame with encoded categorical columns:")
print(df)

# Initialize the IterativeImputer
imputer = IterativeImputer(max_iter=10, random_state=0)

# Fit and transform the data
df_imputed = imputer.fit_transform(df)

# Convert the numpy array back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

print("\nDataFrame after imputation:")
print(df_imputed)

# Convert encoded columns back to their original categories using a safer mapping strategy
for col in object_cols:
    valid_codes = range(len(original_categories[col]))
    df_imputed[col] = df_imputed[col].round().astype(int).map(lambda x: original_categories[col][x] if x in valid_codes else np.nan)

print("\nDataFrame with encoded columns converted back to original categories:")
print(df_imputed)


Original DataFrame with missing values:
             id  region  price    year  manufacturer  model  condition  \
0    7222695916      11   6000     NaN           NaN    NaN        NaN   
1    7218891961       5  11900     NaN           NaN    NaN        NaN   
2    7221797935       6  21000     NaN           NaN    NaN        NaN   
3    7222270760      13   1500     NaN           NaN    NaN        NaN   
4    7210384030       7   4900     NaN           NaN    NaN        NaN   
..          ...     ...    ...     ...           ...    ...        ...   
195  7316711193       2  34995  2012.0           8.0   20.0        NaN   
196  7316707537       2   3200  2005.0           9.0   51.0        2.0   
197  7316702941       2   3495  1999.0           9.0  125.0        2.0   
198  7316695553       2   6500  2002.0           9.0  111.0        2.0   
199  7316694878       2  16988  2008.0          26.0   50.0        NaN   

     cylinders  fuel  odometer  title_status  transmission    VIN  driv

In [19]:
# Display the original DataFrame with missing values
print("Original DataFrame with missing values:")
print(df)

# Identify object columns and convert them to categorical type
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    df[col] = df[col].astype('category')

print("\nDataFrame with object columns converted to categorical type:")
print(df)

# Encode categorical columns
for col in object_cols:
    df[col] = df[col].cat.codes.replace(-1, np.nan)  # Replace -1 (pandas code for NaN) with np.nan

print("\nDataFrame with encoded categorical columns:")
print(df)

# Initialize the IterativeImputer
imputer = IterativeImputer(max_iter=10, random_state=0)

# Fit and transform the data
df_imputed = imputer.fit_transform(df)

# Convert the numpy array back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

print("\nDataFrame after imputation:")
print(df_imputed)

# Convert encoded columns back to their original categories
for col in object_cols:
    df_imputed[col] = df_imputed[col].round().astype(int).map(lambda x: original_categories[col][x])

print("\nDataFrame with encoded columns converted back to original categories:")
print(df_imputed)

Original DataFrame with missing values:
             id                  region  price    year manufacturer  \
0    7222695916                prescott   6000     NaN          NaN   
1    7218891961            fayetteville  11900     NaN          NaN   
2    7221797935            florida keys  21000     NaN          NaN   
3    7222270760  worcester / central MA   1500     NaN          NaN   
4    7210384030              greensboro   4900     NaN          NaN   
..          ...                     ...    ...     ...          ...   
195  7316711193              birmingham  34995  2012.0        dodge   
196  7316707537              birmingham   3200  2005.0         ford   
197  7316702941              birmingham   3495  1999.0         ford   
198  7316695553              birmingham   6500  2002.0         ford   
199  7316694878              birmingham  16988  2008.0       toyota   

                 model condition    cylinders fuel  odometer title_status  \
0                  NaN       N

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

KeyError: 'region'

In [2]:
print(df)

   manufacturer  model    year    price  odometer
0           0.0      3  2010.0  10000.0   50000.0
1           2.0      2  2011.0  15000.0   60000.0
2           1.0      1  2012.0  20000.0   70000.0
3           0.0      4     NaN  12000.0   80000.0
4           NaN      0  2014.0      NaN       NaN


In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Sample DataFrame with missing string values
data = {
    'manufacturer': ['Ford', 'Toyota', 'Honda', 'Ford', np.nan],
    'model': ['Focus', 'Corolla', 'Civic', 'sierra 1500 crew cab slt', 'Camry'],
    'year': [2010, 2011, 2012, None, 2014],
    'price': [10000, 15000, 20000, 12000, np.nan],
    'odometer': [50000, 60000, 70000, 80000, np.nan]
}

df = pd.DataFrame(data)

# Display the original DataFrame with missing values
print("Original DataFrame with missing values:")
print(df)

# Identify object columns and convert them to categorical type
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    df[col] = df[col].astype('category')

print("\nDataFrame with object columns converted to categorical type:")
print(df)

# Store the original categories
original_categories = {}
for col in object_cols:
    original_categories[col] = df[col].cat.categories

# Encode categorical columns
for col in object_cols:
    df[col] = df[col].cat.codes.replace(-1, np.nan)  # Replace -1 (pandas code for NaN) with np.nan

print("\nDataFrame with encoded categorical columns:")
print(df)

# Initialize the SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')

# Fit and transform the data
df_imputed = imputer.fit_transform(df)

# Convert the numpy array back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

print("\nDataFrame after imputation:")
print(df_imputed)

# Convert encoded columns back to their original categories
for col in object_cols:
    df_imputed[col] = df_imputed[col].round().astype(int).map(lambda x: original_categories[col][x])

print("\nDataFrame with encoded columns converted back to original categories:")
print(df_imputed)


Original DataFrame with missing values:
  manufacturer                     model    year    price  odometer
0         Ford                     Focus  2010.0  10000.0   50000.0
1       Toyota                   Corolla  2011.0  15000.0   60000.0
2        Honda                     Civic  2012.0  20000.0   70000.0
3         Ford  sierra 1500 crew cab slt     NaN  12000.0   80000.0
4          NaN                     Camry  2014.0      NaN       NaN

DataFrame with object columns converted to categorical type:
  manufacturer                     model    year    price  odometer
0         Ford                     Focus  2010.0  10000.0   50000.0
1       Toyota                   Corolla  2011.0  15000.0   60000.0
2        Honda                     Civic  2012.0  20000.0   70000.0
3         Ford  sierra 1500 crew cab slt     NaN  12000.0   80000.0
4          NaN                     Camry  2014.0      NaN       NaN

DataFrame with encoded categorical columns:
   manufacturer  model    year    pri

In [4]:
print(original_categories)

Index([0, 1, 2, 3, 4], dtype='int8')


In [5]:
print (df)

   manufacturer  model    year    price  odometer
0           0.0      3  2010.0  10000.0   50000.0
1           2.0      2  2011.0  15000.0   60000.0
2           1.0      1  2012.0  20000.0   70000.0
3           0.0      4     NaN  12000.0   80000.0
4           NaN      0  2014.0      NaN       NaN


In [24]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Load the dataset
df = pd.read_csv('data/vehicles.csv')

# Display the original DataFrame with missing values
print("Original DataFrame with missing values:")
print(df.head())

# Identify object columns and convert them to categorical type
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    df[col] = df[col].astype('category')

print("\nDataFrame with object columns converted to categorical type:")
print(df.head())

# Store the original categories
original_categories = {}
for col in object_cols:
    original_categories[col] = df[col].cat.categories

# Encode categorical columns
for col in object_cols:
    df[col] = df[col].cat.codes.replace(-1, np.nan)  # Replace -1 (pandas code for NaN) with np.nan

print("\nDataFrame with encoded categorical columns:")
print(df.head())

# Initialize the IterativeImputer
imputer = IterativeImputer(max_iter=10, random_state=0)

# Fit and transform the data
df_imputed = imputer.fit_transform(df)

# Convert the numpy array back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

print("\nDataFrame after imputation:")
print(df_imputed.head())

# Convert encoded columns back to their original categories using a safer mapping strategy
for col in object_cols:
    valid_codes = range(len(original_categories[col]))
    df_imputed[col] = df_imputed[col].round().astype(int).map(lambda x: original_categories[col][x] if x in valid_codes else np.nan)

print("\nDataFrame with encoded columns converted back to original categories:")
print(df_imputed.head())


Original DataFrame with missing values:
           id                  region  price  year manufacturer model  \
0  7222695916                prescott   6000   NaN          NaN   NaN   
1  7218891961            fayetteville  11900   NaN          NaN   NaN   
2  7221797935            florida keys  21000   NaN          NaN   NaN   
3  7222270760  worcester / central MA   1500   NaN          NaN   NaN   
4  7210384030              greensboro   4900   NaN          NaN   NaN   

  condition cylinders fuel  odometer title_status transmission  VIN drive  \
0       NaN       NaN  NaN       NaN          NaN          NaN  NaN   NaN   
1       NaN       NaN  NaN       NaN          NaN          NaN  NaN   NaN   
2       NaN       NaN  NaN       NaN          NaN          NaN  NaN   NaN   
3       NaN       NaN  NaN       NaN          NaN          NaN  NaN   NaN   
4       NaN       NaN  NaN       NaN          NaN          NaN  NaN   NaN   

  size type paint_color state  
0  NaN  NaN         NaN   