<a href="https://colab.research.google.com/github/zqiu6882/dataset/blob/main/2810_group.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -nc (no-clobber): skip the download when vehicles.csv is already present. Without it, a re-run
# saves the new copy as vehicles.csv.1 and the notebook keeps reading the old vehicles.csv.
!wget -nc https://raw.githubusercontent.com/zqiu6882/dataset/main/vehicles.csv

--2023-04-05 16:25:43--  https://raw.githubusercontent.com/zqiu6882/dataset/main/vehicles.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16217415 (15M) [text/plain]
Saving to: ‘vehicles.csv’


2023-04-05 16:25:43 (271 MB/s) - ‘vehicles.csv’ saved [16217415/16217415]



In [2]:
# ================================================================================
# Import packages and Data
# ================================================================================

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy import stats
import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) raised by later cells.
warnings.filterwarnings(action='ignore')

# Raw vehicle table, read from the CSV in the working directory.
data = pd.read_csv(filepath_or_buffer='vehicles.csv')

In [3]:
# ================================================================================
# Data Cleaning - Choosing variables and filtering out irrelevant entries
# ================================================================================
# Builds `data2` (the cleaned frame) from `data`, together with the column lists
# `variables`, `numerical` and `categorical`.
#
# Every column edit is assigned back through data2[...] / data2.loc[...]. Calling
# `.replace(..., inplace=True)` on a selected column, or assigning through
# `data2.trany[mask] = ...`, is a chained operation: pandas does not guarantee that it
# reaches `data2`, and under Copy-on-Write it never does.
variables = ['comb08','displ','atvType','barrels08','city08','co2TailpipeGpm','cylinders','drive','feScore',
             'fuelCost08','fuelType','ghgScore','highway08','hlv','hpv','lv2','lv4','mpgData',
             'pv2','pv4','UCity','UHighway','year','guzzler','tCharger','sCharger','startStop','trany','VClass']
data2 = data[variables].copy()  # explicit copy: `data` is never modified, so this cell can be re-run

# Keep model years 1984-2020: range() excludes its end point, so 2021 is NOT kept.
# NOTE(review): the original comment asked for 1984-2021; if 2021 must be included use range(1984, 2022).
data2 = data2[data2['year'].isin(range(1984, 2021))]

# Only vehicles with these fuel types are considered
data2 = data2[data2['fuelType'].isin(['Regular', 'Premium', 'Diesel', 'Midgrade'])]
# Hybrids are excluded; atvType is only needed for this filter, so it is dropped afterwards
data2 = data2[~data2['atvType'].isin(['Hybrid'])]
variables.remove('atvType')
data2 = data2.drop(columns=['atvType'])

# Converting sCharger and tCharger to dummy variables
data2['tCharger'] = data2['tCharger'].replace([np.nan, 'T'], [0, 1])
data2['sCharger'] = data2['sCharger'].replace([np.nan, 'S'], [0, 1])
# Converting guzzler to a dummy variable: 1 if this vehicle is subject to the gas guzzler tax and 0 otherwise
data2['guzzler'] = data2['guzzler'].replace(['T', 'G', 'S', np.nan], [1, 1, 0, 0])
# Converting startStop to a dummy variable: 1 if this vehicle has start-stop technology
data2['startStop'] = data2['startStop'].replace(['Y', 'N', np.nan], [1, 0, 0])
# Converting mpgData to a dummy variable: 1 if this vehicle has mpg Data
data2['mpgData'] = data2['mpgData'].replace(['Y', 'N'], [1, 0])

# Converting year built to age of the vehicle (relative to 2023)
variables.append('age')
variables.remove('year')
data2['age'] = 2023 - data2['year']
data2 = data2.drop(columns=['year'])

# Collapsing trany into 'Automatic' / 'Manual'; missing values become the sentinel -1.
# Both masks are taken from the original labels; they cannot overlap, so the order does not matter.
trany_text = data2['trany'].astype(str)
data2.loc[trany_text.str.startswith('Automatic'), 'trany'] = 'Automatic'
data2.loc[trany_text.str.startswith('Manual'), 'trany'] = 'Manual'
data2['trany'] = data2['trany'].replace([np.nan], [-1])  # only two obs in this group

numerical = ['displ','barrels08','city08','co2TailpipeGpm','cylinders','fuelCost08','highway08','hlv','hpv',
             'lv2','lv4','pv2','pv4','UCity','UHighway','age']
categorical = ['drive','fuelType','mpgData','guzzler','tCharger','sCharger','startStop',
               'feScore', 'ghgScore','trany','VClass'] # Deal with these later

# Remove rows with a missing displ or comb08, and rows whose trany was missing (sentinel -1)
data2 = data2.dropna(subset=['displ', 'comb08'])
data2 = data2[data2['trany'] != -1]

data2.head()

Unnamed: 0,comb08,displ,barrels08,city08,co2TailpipeGpm,cylinders,drive,feScore,fuelCost08,fuelType,...,pv4,UCity,UHighway,guzzler,tCharger,sCharger,startStop,trany,VClass,age
0,21,2.0,14.167143,19,423.190476,4.0,Rear-Wheel Drive,-1,2900,Regular,...,0,23.3333,35.0,0.0,0,0,0.0,Manual,Two Seaters,38
1,11,4.9,27.046364,9,807.909091,12.0,Rear-Wheel Drive,-1,5550,Regular,...,0,11.0,19.0,1.0,0,0,0.0,Manual,Two Seaters,38
2,27,2.2,11.018889,23,329.148148,4.0,Front-Wheel Drive,-1,2250,Regular,...,0,29.0,47.0,0.0,0,0,0.0,Manual,Subcompact Cars,38
3,11,5.2,27.046364,10,807.909091,8.0,Rear-Wheel Drive,-1,5550,Regular,...,0,12.2222,16.6667,0.0,0,0,0.0,Automatic,Vans,38
4,19,2.2,15.658421,17,467.736842,4.0,4-Wheel or All-Wheel Drive,-1,3800,Premium,...,90,21.0,32.0,0.0,1,0,0.0,Manual,Compact Cars,30


In [4]:
# ================================================================================
# Splitting training and test sets
# ================================================================================


# Seed for the random split. The original note on this line asks to add your student numbers to it.
state = 520005325

# Randomly draw 80% of the cleaned rows for training ...
train = data2.sample(frac=0.8, random_state=state)
# ... and keep every row whose index label was not drawn as the test set (only for prediction).
test = data2[~data2.index.isin(train.index)].copy()

# Renumber both partitions from 0, discarding the old index labels.
train, test = train.reset_index(drop=True), test.reset_index(drop=True)

In [5]:
# Display the (rows, columns) shape of the cleaned frame `data2`.
data2.shape

(39536, 28)