# Homework 3 - Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('car_fuel_efficiency.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
df.fillna(0, inplace=True)

## Question 1: What is the mode of the `origin` variable?

In [3]:
df['origin'].mode()[0]

'europe'

## Question 2: What's the median of the `horsepower` variable?

In [4]:
df['horsepower'].median()

np.float64(146.0)

## Data Preparation

In [5]:
df['above_average'] = (df['fuel_efficiency_mpg'] > df['fuel_efficiency_mpg'].mean()).astype(int)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

## Question 3: What's the accuracy on the validation dataset?

In [6]:
numerical = ['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year']
dv = DictVectorizer(sparse=False)
train_dict = df_train[numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)
val_dict = df_val[numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict(X_val)
accuracy_score(y_val, y_pred)

0.9366306027820711

## Question 4: What's the least useful feature?

In [7]:
features = numerical
results = {}
for feature in features:
    subset = [f for f in features if f != feature]
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[subset].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    model.fit(X_train, y_train)
    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    y_pred = model.predict(X_val)
    results[feature] = accuracy_score(y_val, y_pred)
min(results, key=results.get)

'vehicle_weight'

## Question 5: What's the difference between the sum of misclassifications for the Regularized Logistic Regression model and the Unregularized one?

In [8]:
del df['fuel_efficiency_mpg']
X = df.drop('above_average', axis=1)
y = df['above_average']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
categorical = list(X_train.dtypes[X_train.dtypes == 'object'].index)
numerical = list(X_train.dtypes[X_train.dtypes != 'object'].index)
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(X_train.to_dict(orient='records'))
X_test = dv.transform(X_test.to_dict(orient='records'))
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
unregularized_misclassifications = (y_pred != y_test).sum()
results = {}
for C in [0.001, 0.01, 0.1, 1, 10]:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    regularized_misclassifications = (y_pred != y_test).sum()
    results[C] = unregularized_misclassifications - regularized_misclassifications
results

{0.001: np.int64(1),
 0.01: np.int64(0),
 0.1: np.int64(0),
 1: np.int64(0),
 10: np.int64(0)}

## Question 6: Which of these models has the best RMSE on the validation set?

In [9]:
results = {}
for alpha in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[alpha] = np.sqrt(mean_squared_error(y_test, y_pred))
min(results, key=results.get)

10