In [1]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict, cross_val_score

In [2]:
data = pd.read_csv("Cleaned_Data.csv")

In [3]:
data_set = data

In [4]:
data_label = data["price"]

In [5]:
data_set.drop("price", axis = 1, inplace = True)

# KNN Algorithm

In [34]:
from sklearn.neighbors import KNeighborsRegressor

In [39]:
model = KNeighborsRegressor(n_neighbors=100, weights = "distance")

In [40]:
predictions = cross_val_predict(model, data_set, data_label, cv = 4)

In [41]:
predictions

array([28374.12743203, 29345.        , 15822.67317568, ...,
       20133.82625231, 26417.79817381, 31131.85440396])

In [42]:
data_label

0         26650.0
1         29345.0
2         13950.0
3         31028.0
4         21525.0
           ...   
199995    16005.0
199996    25597.0
199997    16935.0
199998    22499.0
199999    25765.0
Name: price, Length: 200000, dtype: float64

In [17]:
from sklearn.metrics import r2_score

In [43]:
print(r2_score(data_label, predictions))

0.8190872820118816


# Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
reg = LinearRegression()

In [29]:
reg.fit(data_set, data_label)

LinearRegression()

In [25]:
predictions = cross_val_predict(reg, data_set, data_label, cv = 4)

In [33]:
predictions

array([32943.875  , 29361.09375, 19687.4375 , ..., 19733.4375 ,
       36528.4375 , 24321.6875 ])

In [17]:
from sklearn.metrics import r2_score


In [32]:
print(r2_score(data_label, predictions))

0.889036506572124


# Decision Trees

In [44]:
from sklearn.tree import DecisionTreeRegressor

In [48]:
decision_tree_model = DecisionTreeRegressor(random_state=0)

In [49]:
predictions = cross_val_predict(decision_tree_model,data_set, data_label, cv = 4 )

In [50]:
predictions

array([28725., 29345., 13800., ..., 22980., 21495., 25765.])

In [51]:
data_label

0         26650.0
1         29345.0
2         13950.0
3         31028.0
4         21525.0
           ...   
199995    16005.0
199996    25597.0
199997    16935.0
199998    22499.0
199999    25765.0
Name: price, Length: 200000, dtype: float64

In [53]:
print(r2_score(data_label, predictions))

0.8804149647651485


# Random Forests

In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
random_forest_model = RandomForestRegressor(max_depth=10, random_state=0)

In [8]:
predictions = cross_val_predict(random_forest_model,data_set, data_label, cv = 4 )

In [9]:
predictions

array([29541.95236569, 35356.2740839 , 17133.31612873, ...,
       20045.9105605 , 18106.85587147, 24724.35973115])

In [13]:
print(r2_score(data_label, predictions))

0.8677915344883855


# Artificial Neural Networks

In [9]:
from sklearn.neural_network import MLPRegressor

In [10]:
from sklearn.datasets import make_regression

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
regr = MLPRegressor(random_state=1, max_iter=100, hidden_layer_sizes =[10,10 ])

In [13]:
predictions = cross_val_predict(regr, data_set, data_label, cv = 4)



In [14]:
predictions

array([28587.62727203, 30037.20464684, 15526.84749083, ...,
       19074.00684495, 23659.78978713, 24219.92181695])

In [15]:
data_label

0         26650.0
1         29345.0
2         13950.0
3         31028.0
4         21525.0
           ...   
199995    16005.0
199996    25597.0
199997    16935.0
199998    22499.0
199999    25765.0
Name: price, Length: 200000, dtype: float64

In [18]:
print(r2_score(data_label, predictions))

0.9032072769036771


# Logistic Regression

Since our problem is a regression problem, we needed to bin the data to be able to apply logisitc regression on the data. We will bin data according to the percentiles

In [3]:
pd.qcut(data['price'], q=10)

0         (24499.0, 27400.4]
1         (27400.4, 30999.0]
2         (10995.0, 15900.0]
3         (30999.0, 37354.0]
4         (18900.0, 21605.0]
                 ...        
199995    (15900.0, 18900.0]
199996    (24499.0, 27400.4]
199997    (15900.0, 18900.0]
199998    (21605.0, 24499.0]
199999    (24499.0, 27400.4]
Name: price, Length: 200000, dtype: category
Categories (10, interval[float64, right]): [(348.999, 10995.0] < (10995.0, 15900.0] < (15900.0, 18900.0] < (18900.0, 21605.0] ... (27400.4, 30999.0] < (30999.0, 37354.0] < (37354.0, 46804.0] < (46804.0, 849988.0]]

In [4]:
data["temp_bins"] = pd.qcut(data['price'], q=10)
data.columns

Index(['back_legroom', 'city_fuel_economy', 'daysonmarket',
       'engine_displacement', 'franchise_dealer', 'front_legroom',
       'fuel_tank_volume', 'height', 'highway_fuel_economy', 'horsepower',
       ...
       ''501A Mid Equipment Group'', ''Appearance and Protection Package'',
       ''5th Wheel'', ''Quiet Package'', ''Graphics Package'',
       ''King Ranch Package'', ''Z 71 Package'',
       ''601A Luxury Equipment Group'', 'Distances', 'temp_bins'],
      dtype='object', length=1700)

In [5]:
lis = []

for i in range(0, len(data)):
    #print(i)
    lis.append(str(data.loc[i]["temp_bins"]))
lis

['(24499.0, 27400.4]',
 '(27400.4, 30999.0]',
 '(10995.0, 15900.0]',
 '(30999.0, 37354.0]',
 '(18900.0, 21605.0]',
 '(15900.0, 18900.0]',
 '(18900.0, 21605.0]',
 '(18900.0, 21605.0]',
 '(348.999, 10995.0]',
 '(348.999, 10995.0]',
 '(348.999, 10995.0]',
 '(46804.0, 849988.0]',
 '(27400.4, 30999.0]',
 '(348.999, 10995.0]',
 '(27400.4, 30999.0]',
 '(46804.0, 849988.0]',
 '(18900.0, 21605.0]',
 '(15900.0, 18900.0]',
 '(30999.0, 37354.0]',
 '(24499.0, 27400.4]',
 '(24499.0, 27400.4]',
 '(18900.0, 21605.0]',
 '(27400.4, 30999.0]',
 '(37354.0, 46804.0]',
 '(30999.0, 37354.0]',
 '(21605.0, 24499.0]',
 '(21605.0, 24499.0]',
 '(46804.0, 849988.0]',
 '(18900.0, 21605.0]',
 '(24499.0, 27400.4]',
 '(27400.4, 30999.0]',
 '(18900.0, 21605.0]',
 '(37354.0, 46804.0]',
 '(37354.0, 46804.0]',
 '(10995.0, 15900.0]',
 '(348.999, 10995.0]',
 '(10995.0, 15900.0]',
 '(27400.4, 30999.0]',
 '(18900.0, 21605.0]',
 '(18900.0, 21605.0]',
 '(24499.0, 27400.4]',
 '(10995.0, 15900.0]',
 '(46804.0, 849988.0]',
 '(1099

In [6]:
len(lis)

200000

In [7]:
data["price_bins"] = lis

In [8]:
data.drop("temp_bins", inplace = True, axis = 1)

In [9]:
data.drop("price", axis = 1, inplace = True)

In [10]:
data_set = data

In [11]:
data_label = data_set["price_bins"]

In [12]:
data_set.drop("price_bins", axis = 1, inplace = True)

In [13]:
data_label

0         (24499.0, 27400.4]
1         (27400.4, 30999.0]
2         (10995.0, 15900.0]
3         (30999.0, 37354.0]
4         (18900.0, 21605.0]
                 ...        
199995    (15900.0, 18900.0]
199996    (24499.0, 27400.4]
199997    (15900.0, 18900.0]
199998    (21605.0, 24499.0]
199999    (24499.0, 27400.4]
Name: price_bins, Length: 200000, dtype: object

In [14]:
from sklearn.linear_model import LogisticRegression

In [25]:
model = LogisticRegression(max_iter = 500, random_state =0)

In [26]:
predictions = cross_val_predict(model, data_set, data_label, cv = 4)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [27]:
predictions

array(['(27400.4, 30999.0]', '(30999.0, 37354.0]', '(10995.0, 15900.0]',
       ..., '(15900.0, 18900.0]', '(30999.0, 37354.0]',
       '(24499.0, 27400.4]'], dtype=object)

In [29]:
data_label

0         (24499.0, 27400.4]
1         (27400.4, 30999.0]
2         (10995.0, 15900.0]
3         (30999.0, 37354.0]
4         (18900.0, 21605.0]
                 ...        
199995    (15900.0, 18900.0]
199996    (24499.0, 27400.4]
199997    (15900.0, 18900.0]
199998    (21605.0, 24499.0]
199999    (24499.0, 27400.4]
Name: price_bins, Length: 200000, dtype: object

In [30]:
from sklearn.metrics import precision_score

In [31]:
print(precision_score(data_label, predictions, average = "macro"))

0.610105857686245


In [32]:
from sklearn.metrics import recall_score

In [33]:
print(recall_score(data_label, predictions, average = "macro"))

0.6124395844591326


In [34]:
from sklearn.metrics import accuracy_score

In [35]:
print(accuracy_score(data_label, predictions))

0.61263


# Naive Bayes Classifier

In [67]:
from sklearn.naive_bayes import GaussianNB

In [68]:
model = GaussianNB()

In [69]:
predictions = cross_val_predict(model, data_set, data_label, cv= 4)

In [70]:
predictions

array(['(37354.0, 46804.0]', '(30999.0, 37354.0]', '(37354.0, 46804.0]',
       ..., '(21605.0, 24499.0]', '(46804.0, 849988.0]',
       '(27400.4, 30999.0]'], dtype='<U19')

In [71]:
print(precision_score(data_label, predictions, average = "macro"))

0.40080034207592447


In [72]:
print(recall_score(data_label, predictions, average = "macro"))

0.31601451978188444


In [73]:
print(accuracy_score(data_label, predictions))

0.31585
