## Step 1: Define Regression Analysis Calculator

In [13]:
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

class DecisionTree_Calculator:
    def __init__(self, data):
        self.data = data
        self.y = None
        self.x = None
        print('Decision Tree Calculator Initialized.')

    def define_y_value(self, y_val):
        self.y = self.data[y_val]
        self.x = self.data.drop(columns=[y_val])

    def train_decision_tree(self, regression=True, max_depth=None):
        x_train, x_test, y_train, y_test = train_test_split(self.x, self.y, test_size=0.2, random_state=0)
        
        if regression:
            model = DecisionTreeRegressor(max_depth=max_depth)
        else:
            model = DecisionTreeClassifier(max_depth=max_depth)
        
        model.fit(x_train, y_train)
        self.model = model
        
        print('Decision Tree Model Trained.')

    def evaluate_model_accuracy(self):
        y_train_pred = self.model.predict(x_train)
        y_test_pred = self.model.predict(x_test)

        # Evaluate metrics
        if isinstance(self.model, DecisionTreeRegressor):
            print('Mean Absolute Error on Test Data:', mean_absolute_error(y_test, y_test_pred))
            print('Mean Squared Error on Test Data:', mean_squared_error(y_test, y_test_pred))
            print('Root Mean Squared Error on Test Data:', np.sqrt(mean_squared_error(y_test, y_test_pred)))
            print('R squared on test set:', r2_score(y_test, y_test_pred))
        else:  # Classification
            # Implement classification metrics if needed
            pass
        
        # Plotting if needed (for regression)
        plt.figure(figsize=(10, 6))
        plt.scatter(y_test, y_test_pred, color='blue')
        plt.xlabel('True Values')
        plt.ylabel('Predicted Values')
        plt.title('True vs Predicted Values')
        plt.show()


## Step 2: Prepare Data
- Read Data
- Preprocess Data 
- Change Any Columns with 0/1s to Object Data Types

In [14]:
import pandas as pd
import os

data = pd.read_csv(os.path.join(os.getcwd(),'data_clean.csv'))

In [15]:
data.head()
data.dtypes

manufacturer            object
model                   object
year                     int64
mileage                float64
engine                  object
transmission            object
drivetrain              object
fuel_type               object
mpg                     object
max_mpg                float64
exterior_color          object
interior_color          object
accidents_or_damage    float64
one_owner              float64
personal_use_only      float64
seller_name             object
seller_rating          float64
price_drop             float64
price                  float64
dtype: object

In [16]:
data

Unnamed: 0,manufacturer,model,year,mileage,engine,transmission,drivetrain,fuel_type,mpg,max_mpg,exterior_color,interior_color,accidents_or_damage,one_owner,personal_use_only,seller_name,seller_rating,price_drop,price
0,Acura,ILX Hybrid 1.5L,2013,92945.0,"1.5L I-4 i-VTEC variable valve control, engine...",Automatic,Front-wheel Drive,Gasoline,39-38,38.0,Black,Parchment,0.0,0.0,0.0,Iconic Coach,0.0,300.0,13988.0
1,Acura,ILX Hybrid 1.5L,2013,47645.0,1.5L I4 8V MPFI SOHC Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,39-38,38.0,Gray,Ebony,1.0,1.0,1.0,Kars Today,0.0,0.0,17995.0
2,Acura,ILX Hybrid 1.5L,2013,53422.0,1.5L I4 8V MPFI SOHC Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,39-38,38.0,Bellanova White Pearl,Ebony,0.0,1.0,1.0,Weiss Toyota of South County,4.3,500.0,17000.0
3,Acura,ILX Hybrid 1.5L,2013,62042.0,1.5L I4 8V MPFI SOHC Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,39-38,38.0,Polished Metal Metallic,Ebony,0.0,0.0,1.0,Kalidy Kia,2.2,109.0,18000.0
4,Acura,ILX Hybrid 1.5L,2013,57212.0,1.5L I4 8V MPFI SOHC Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,39-38,38.0,Silver,Ebony,0.0,1.0,1.0,Ohio Car Mart,0.0,0.0,15999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566811,Volvo,S60 T5,2020,26781.0,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Front-wheel Drive,Gasoline,23-34,34.0,Crystal White Pearl Metallic,Maroon Brown,1.0,1.0,1.0,Jenkins Volvo of Ocala,5.0,1108.0,30883.0
566812,Volvo,S60 B5 Momentum,2022,22877.0,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,All-wheel Drive,Gasoline,25-33,33.0,Bright Silver Metallic,Blond,0.0,1.0,0.0,Volvo Cars Danbury,4.2,250.0,34798.0
566813,Volvo,S60 T5,2014,92000.0,2.5L I5 20V MPFI DOHC Turbo,6-Speed Automatic,Front-wheel Drive,Gasoline,21-30,30.0,Ice White,Soft Beige,0.0,0.0,1.0,Dapper Car Sales,0.0,300.0,12299.0
566814,Volvo,S60 T5 Platinum,2013,132000.0,2.5L I5 20V MPFI DOHC Turbo,6-Speed Automatic,All-wheel Drive,Gasoline,20-29,29.0,Ice White,Off Black,1.0,0.0,0.0,Legend Auto Sales Inc.,4.6,1000.0,8995.0


## Step 3: Perform Regressions

In [17]:
# Assuming 'data' is your pandas DataFrame with appropriate columns
calculator = DecisionTree_Calculator(data)

# Define which column is the target variable
calculator.define_y_value('price')

# Train the decision tree model (regression by default, set regression=False for classification)
calculator.train_decision_tree()

# Evaluate model accuracy
calculator.evaluate_model_accuracy()


Decision Tree Calculator Initialized.


ValueError: could not convert string to float: 'Toyota'