In [1]:
import pandas as pd

In [2]:
#reading the csv file. Data can be extracted from database using sql as well
vehicles = pd.read_csv('clean_vehicles_data.csv')

In [3]:
#Implementing Linear Regression Model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Function to preprocess, train the model, and predict
def preprocess_and_predict(df, target_column):
    # Splitting into features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # One-hot encode categorical features
    X = pd.get_dummies(X, drop_first=True)

    # Splitting data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Training a Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Making predictions
    y_pred = model.predict(X_test)

    # Evaluating the model
    print(f"Evaluation for {target_column}:")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R-squared Score:", r2_score(y_test, y_pred))
    print()
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return model, X_train, mse, r2

In [4]:
# Assuming your data is in a DataFrame called 'v'
prices = vehicles['price']

# Calculate Q1, Q3, and IQR
Q1 = prices.quantile(0.25)
Q3 = prices.quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Count outliers
outliers = prices[(prices < lower_bound) | (prices > upper_bound)]
print(f"Number of outliers: {len(outliers)}")

# Remove outliers
v_cleaned = vehicles[(vehicles['price'] >= lower_bound) & (vehicles['price'] <= upper_bound)]

print(f"Original dataset size: {len(vehicles)}")
print(f"Cleaned dataset size: {len(v_cleaned)}")

Number of outliers: 7564
Original dataset size: 390008
Cleaned dataset size: 382444


In [5]:
v_cleaned.dropna()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,type,paint_color,description,state
0,auburn,33590,2014.0,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,57923.0,clean,other,3GTP1VEC4EG551563,4wd,pickup,white,Carvana is the safer way to buy a car During t...,al
1,auburn,22590,2010.0,chevrolet,silverado 1500,good,8 cylinders,gas,71229.0,clean,other,1GCSCSE06AZ123805,4wd,pickup,blue,Carvana is the safer way to buy a car During t...,al
2,auburn,39590,2020.0,chevrolet,silverado 1500 crew,good,8 cylinders,gas,19160.0,clean,other,3GCPWCED5LG130317,4wd,pickup,red,Carvana is the safer way to buy a car During t...,al
3,auburn,30990,2017.0,toyota,tundra double cab sr,good,8 cylinders,gas,41124.0,clean,other,5TFRM5F17HX120972,4wd,pickup,red,Carvana is the safer way to buy a car During t...,al
5,auburn,27990,2012.0,gmc,sierra 2500 hd extended cab,good,8 cylinders,gas,68696.0,clean,other,1GT220CG8CZ231238,4wd,pickup,black,Carvana is the safer way to buy a car During t...,al
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390003,wyoming,23590,2019.0,nissan,maxima s sedan 4d,good,6 cylinders,gas,32226.0,clean,other,1N4AA6AV6KC367801,fwd,sedan,white,Carvana is the safer way to buy a car During t...,wy
390004,wyoming,30590,2020.0,volvo,s60 t5 momentum sedan 4d,good,4 cylinders,gas,12029.0,clean,other,7JR102FKXLG042696,fwd,sedan,red,Carvana is the safer way to buy a car During t...,wy
390005,wyoming,34990,2020.0,cadillac,xt4 sport suv 4d,good,4 cylinders,diesel,4174.0,clean,other,1GYFZFR46LF088296,4wd,hatchback,white,Carvana is the safer way to buy a car During t...,wy
390006,wyoming,28990,2018.0,lexus,es 350 sedan 4d,good,6 cylinders,gas,30112.0,clean,other,58ABK1GG4JU103853,fwd,sedan,silver,Carvana is the safer way to buy a car During t...,wy


In [6]:
features = ['price','manufacturer', 'condition', 'fuel', 'odometer', 'transmission']
data=v_cleaned[features]

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Reshape Sale Price data for scaling
prices = data['price'].values.reshape(-1, 1)  # Ensure it's numerical and reshaped

# Initialize the scaler
scaler = MinMaxScaler()

# Fit and transform the data
data['Price Norm'] = scaler.fit_transform(prices)
data = data.drop(columns=['price'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Price Norm'] = scaler.fit_transform(prices)


In [8]:
# Predict `price` from vehicles dataset
print("Vehicles Dataset")
vehicles_model, X_train, mse, r2 = preprocess_and_predict(
    data, 
    target_column='Price Norm'
)

Vehicles Dataset
Evaluation for Price Norm:
Mean Squared Error: 0.034664241222644374
R-squared Score: 0.27713343337129437



In [9]:
evaluation_metrics = {
    'Metric': ['Mean Squared Error', 'R-squared'],
    'Value': [mse, r2]
}
evaluation_df = pd.DataFrame(evaluation_metrics)
evaluation_df

Unnamed: 0,Metric,Value
0,Mean Squared Error,0.034664
1,R-squared,0.277133


In [10]:
# Function to predict and reverse normalization
def predict_sale_price(input_data, model, scaler, X_train_columns):
    # Convert input data to DataFrame
    input_df = pd.DataFrame([input_data], columns=input_data.keys())

    # One-hot encode input data to match training set structure
    input_encoded = pd.get_dummies(input_df, drop_first=True)

    # Reindex the input_encoded DataFrame to match the training data columns
    # This will add missing columns with 0 and align the columns
    input_encoded = input_encoded.reindex(columns=X_train_columns, fill_value=0)

    # Predict normalized Sale Price
    price_normalized = model.predict(input_encoded)[0]

    # Reverse normalization
    x_min = scaler.data_min_[0]
    x_max = scaler.data_max_[0]
    price = price_normalized * (x_max - x_min) + x_min

    return price

In [11]:
data

Unnamed: 0,manufacturer,condition,fuel,odometer,transmission,Price Norm
0,gmc,good,gas,57923.0,other,0.574092
1,chevrolet,good,gas,71229.0,other,0.386084
2,chevrolet,good,gas,19160.0,other,0.676643
3,toyota,good,gas,41124.0,other,0.529654
4,ford,excellent,gas,128000.0,automatic,0.256358
...,...,...,...,...,...,...
390003,nissan,good,gas,32226.0,other,0.403176
390004,volvo,good,gas,12029.0,other,0.522817
390005,cadillac,good,diesel,4174.0,other,0.598021
390006,lexus,good,gas,30112.0,other,0.495471


In [12]:
# Example user input

print("Enter the Manufacturer: ")
manu = input()
print()
print("Condition: ")
cond = input()
print()
print("Fuel: ")
fuel = input()
print()
print("Odometer: ")
odo = float(input())
print()
print("Transmission: ")
transm = input()
print()


user_input = {
    'manufacturer': manu,
    'condition': cond,
    'fuel': fuel,
    'odometer': odo,
    'transmission': transm
}

Enter the Manufacturer: 
toyota

Condition: 
good

Fuel: 
other

Odometer: 
59030.0

Transmission: 
automatic



In [13]:
# Predict sale price for the user input
predicted_price = predict_sale_price(user_input, vehicles_model, scaler, X_train.columns)
print(f"Predicted Sale Price for input {user_input}: ${predicted_price:.2f}")

Predicted Sale Price for input {'manufacturer': 'toyota', 'condition': 'good', 'fuel': 'other', 'odometer': 59030.0, 'transmission': 'automatic'}: $26629.35


In [14]:
#predicted data
user_data = pd.DataFrame([user_input])
user_data['Sale Price'] = predicted_price
user_data

Unnamed: 0,manufacturer,condition,fuel,odometer,transmission,Sale Price
0,toyota,good,other,59030.0,automatic,26629.346709


In [15]:
from mysql.connector import connect, Error
import pandas as pd

conn = None

def connect_to_db():
    try:
        global conn
        conn = connect(
            host='localhost',
            user='root',
            password='1234',
            database='final'
        )
        print("Connection successful!")
        return conn
    except Error as e:
        print(f"Error: {e}")

def create_schema(conn):
    schema_sql = """
        USE final;
        CREATE TABLE IF NOT EXISTS vehicles_predictor (
        prediction_id INT AUTO_INCREMENT PRIMARY KEY,
        manufacturer VARCHAR(255),
        `condition` VARCHAR(255),
        fuel VARCHAR(255),
        odometer DOUBLE,
        transmission VARCHAR(255),
        sale_price DOUBLE
        );
        """
    try:
        cursor = conn.cursor()
        # Execute schema creation SQL
        for result in cursor.execute(schema_sql, multi=True):
            if result.with_rows:
                print(f"Rows returned: {result.fetchall()}")
            else:
                print(f"Affected {result.rowcount} rows.")
        print("Schema created successfully.")
    except Error as e:
        print(f"Error creating schema: {e}")
    finally:
        cursor.close()

In [16]:
from sqlalchemy import create_engine
def connect_to_db_sqlalchemy():
    try:
        engine = create_engine("mysql+mysqlconnector://root:1234@localhost/final")
        print("SQLAlchemy Engine created successfully!")
        return engine
    except Error as e:
        print(f"Error: {e}")

def insert_dataframe_to_table(conn, df):
    insert_query = """
        INSERT INTO vehicles_predictor (manufacturer, `condition`, fuel, odometer, transmission, sale_price)
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    try:
        cursor = conn.cursor()
        data = list(df.itertuples(index=False, name=None))
        cursor.executemany(insert_query, data)  
        conn.commit()
        print(f"Inserted {cursor.rowcount} rows successfully.")
    except Error as e:
        print(f"Error inserting data: {e}")
    finally:
        cursor.close()

In [17]:
conn = connect_to_db()
if conn:
    try:
        create_schema(conn)  
        insert_dataframe_to_table(conn, user_data)  
    finally:
        conn.close()
else:
    print("Failed to connect to the database.")

Connection successful!
Affected 0 rows.
Affected 0 rows.
Schema created successfully.
Inserted 1 rows successfully.
