In [584]:
import pandas as pd
import numpy as np
df = pd.read_csv('./input/cars_raw.csv')
df.head()

Unnamed: 0,Year,Make,Model,Used/New,Price,ConsumerRating,ConsumerReviews,SellerType,SellerName,SellerRating,...,InteriorColor,Drivetrain,MinMPG,MaxMPG,FuelType,Transmission,Engine,VIN,Stock#,Mileage
0,2019,Toyota,Sienna SE,Used,"$39,998",4.6,45,Dealer,CarMax Murrieta - Now offering Curbside Pickup...,3.3,...,Black,Front-wheel Drive,19,27,Gasoline,8-Speed Automatic,3.5L V6 24V PDI DOHC,5TDXZ3DC2KS015402,22998646,29403
1,2018,Ford,F-150 Lariat,Used,"$49,985",4.8,817,Dealer,Giant Chevrolet,4.8,...,Black,Four-wheel Drive,19,24,Gasoline,10-Speed Automatic,3.5L V6 24V PDI DOHC Twin Turbo,1FTEW1EG2JFD44217,22418A,32929
2,2017,RAM,1500 Laramie,Used,"$41,860",4.7,495,Dealer,Gill Auto Group Madera,4.6,...,Black,Four-wheel Drive,15,21,Gasoline,8-Speed Automatic,5.7L V8 16V MPFI OHV,1C6RR7VT5HS842283,NG277871G,23173
3,2021,Honda,Accord Sport SE,Used,"$28,500",5.0,36,Dealer,AutoSavvy Las Vegas,4.6,...,–,Front-wheel Drive,29,35,Gasoline,Automatic CVT,1.5L I4 16V GDI DOHC Turbo,1HGCV1F49MA038035,54237,10598
4,2020,Lexus,RX 350,Used,"$49,000",4.8,76,Dealer,Lexus of Henderson,4.8,...,Birch,Front-wheel Drive,20,27,Gasoline,8-Speed Automatic,3.5L V6 24V PDI DOHC,2T2AZMAA8LC156270,HDT4181A,28137


In [585]:
#Remove rows with 'Not Priced'
df = df[df['Price'] != 'Not Priced']

#Remove symbols and convert all entries to integers
df['Price'] = [price.replace('$','') for price in df['Price']]
df['Price'] = [int(price.replace(',','')) for price in df['Price']]
df['Used/New'] = ['Certified Pre-Owned' if 'certified' in item.casefold()
                  else 'Used' for item in df['Used/New']]

df = df[df['Drivetrain'] != '–']

df['Drivetrain'] = ['4WD' if 'four' in item.casefold()
                   else 'AWD' if 'all' in item.casefold()
                   else 'RWD' if 'rear' in item.casefold()
                   else 'FWD' for item in df['Drivetrain']]
df = df[df['FuelType'] != '–']


#One-hot encoding without pd.get_dummies
df['Used/New'] = [1 if item == 'Used' else 0 for item in df['Used/New']]
df['SellerType'] = [1 if item == 'Dealer' else 0 for item in df['SellerType']]

#Modify 'FuelType' column
df['FuelType'] = ['Hybrid' if 'hybrid' in item.casefold()
                 else 'Hybrid' if 'plug' in item.casefold()
                 else 'Flex' if 'flex' in item.casefold()
                 else 'Electric' if 'electric' in item.casefold()
                 else 'Diesel' if 'diesel' in item.casefold()
                 else 'Gas' for item in df['FuelType']]

#Modify 'DealType' column
df['DealType'].fillna('None', inplace=True)
df = df[df['DealType'] != 'None']

#New Columns
df['TransmissionType'] = ['Automatic' if 'auto' in item.casefold()
                         else 'Manual' if 'manual' in item.casefold()
                         else 'Other' for item in df['Transmission']]


make = pd.get_dummies(df['Make'])
drivetrain = pd.get_dummies(df['Drivetrain'])
transmission = pd.get_dummies(df['TransmissionType'],prefix='TM')
fueltype = pd.get_dummies(df['FuelType'],)


columns_to_drop = ['Make','Drivetrain','Transmission','TransmissionType','SellerName',
                  'StreetName','State','Zipcode','FuelType','ExteriorColor','InteriorColor',
                  'Engine','VIN','Stock#','Year','Model']
df.drop(columns_to_drop, axis=1, inplace=True)
df = pd.concat([df,make,drivetrain,transmission,fueltype],axis=1)


In [586]:
#Modeling imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [587]:
#Prepping the data
X = np.array(df.drop('DealType',axis=1))
Y = np.array(df['DealType'])

XTRAIN, XTEST, YTRAIN, YTEST = train_test_split(X,Y,shuffle=True,random_state=69)

In [588]:
#Log Model Import
from sklearn.linear_model import LogisticRegression as LR

In [589]:
#Logistic Regression Model with Scaled Data
scaler = MinMaxScaler()
scaler.fit(XTRAIN)
XTRAIN = scaler.transform(XTRAIN)
XTEST = scaler.transform(XTEST)

lr = LR()
lr.fit(XTRAIN,YTRAIN)
predictions = lr.predict(XTEST)
print(lr.score(XTRAIN,YTRAIN))
print(classification_report(YTEST,predictions))

0.60713763346497
              precision    recall  f1-score   support

        Fair       0.35      0.06      0.10       315
        Good       0.62      0.93      0.74      1394
       Great       0.41      0.09      0.15       570

    accuracy                           0.60      2279
   macro avg       0.46      0.36      0.33      2279
weighted avg       0.53      0.60      0.51      2279



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [590]:
print(confusion_matrix(YTEST,predictions))

[[  19  291    5]
 [  25 1299   70]
 [  10  507   53]]
