In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
wd = "/content/drive/My Drive/Colab/setik/"

In [4]:
df = pd.read_csv(wd + "dataset/01-remove-features.csv")

# Data

In [5]:
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,posting_date
0,6000,,,,,,,,,,,,
1,11900,,,,,,,,,,,,
2,21000,,,,,,,,,,,,
3,1500,,,,,,,,,,,,
4,4900,,,,,,,,,,,,


In [6]:
print("dataset missing values\n")

for i in df.columns.values.tolist():
    n_miss = df[i].isna().sum()
    perc = n_miss / df.shape[0] * 100
    print(f"{i} : {n_miss} ({round(perc, 2)}%)")

dataset missing values

price : 0 (0.0%)
year : 1205 (0.28%)
manufacturer : 17646 (4.13%)
model : 5277 (1.24%)
condition : 174104 (40.79%)
cylinders : 177678 (41.62%)
fuel : 3013 (0.71%)
odometer : 4400 (1.03%)
transmission : 2556 (0.6%)
drive : 130567 (30.59%)
type : 92858 (21.75%)
paint_color : 130203 (30.5%)
posting_date : 68 (0.02%)


# Split Data

In [7]:
from sklearn.model_selection import train_test_split

y = df.price
X = df.drop(['price'], axis =1 )

# Divide data into training and validation subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [8]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = []
categorical_columns = []

for col in X_train.columns.values.tolist():
    if df[col].dtype in numerics:
      numerical_columns.append(col)
    else:
      categorical_columns.append(col)

In [9]:
print("X_train missing values\n")

columns_type = [
    ["numerical_columns", numerical_columns],
    ["categorical_columns", categorical_columns]
]

for i in columns_type:
  print(i[0])

  for j in i[1]:
    n_miss = X_train[j].isna().sum()
    perc = n_miss / df.shape[0] * 100
    print(f"{j} : {n_miss} ({round(perc, 2)}%)")

  print()

X_train missing values

numerical_columns
year : 961 (0.23%)
odometer : 3557 (0.83%)

categorical_columns
manufacturer : 14080 (3.3%)
model : 4187 (0.98%)
condition : 139045 (32.57%)
cylinders : 142163 (33.3%)
fuel : 2384 (0.56%)
transmission : 2043 (0.48%)
drive : 104575 (24.5%)
type : 74294 (17.4%)
paint_color : 104272 (24.43%)
posting_date : 60 (0.01%)



# Handling X_train Numerical Missing Values

In [10]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

iter_imputer = IterativeImputer(random_state=0)
iter_imputed = iter_imputer.fit_transform(X_train[numerical_columns])
df_iter_imputed = pd.DataFrame(iter_imputed, columns=[col for col in X_train[numerical_columns].columns])

# Handling X_train Categorical Missing Values

In [11]:
from sklearn.preprocessing import OneHotEncoder

oh_enc = OneHotEncoder(handle_unknown='ignore')
oh_encoded = pd.DataFrame(oh_enc.fit_transform(X_train[categorical_columns]).toarray())

In [None]:
cat_iter_imputer = IterativeImputer(random_state=0)
cat_iter_imputed = cat_iter_imputer.fit_transform(oh_encoded)
df_cat_iter_imputed = pd.DataFrame(iter_imputed, columns=[col for col in X_train[categorical_columns].columns])

In [None]:
return
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns)
])

In [None]:
return
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=10, random_state=0)

In [None]:
return
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

# Evaluate the model
score = mean_absolute_error(y_test, preds)
print('MAE:', score)