In [62]:
# Dataset used: electric vehicle (EV) data from Kaggle: https://www.kaggle.com/datasets/geoffnel/evs-one-electric-vehicle-dataset

# Загружаем библиотеки
import numpy as np # работа с векторами
import matplotlib.pyplot as plt # рисовать графики
import pandas as pd # для работы с табличная
import sklearn # машинное обучение на python
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

## Home Assignment 1. Data processing

In [63]:
# Data processing: read the electric-vehicle table from the CSV file in the
# working directory, then print a per-column summary (non-null counts, dtypes).
dataset = pd.read_csv(filepath_or_buffer='ElectricCarData_Clean.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Brand_name       103 non-null    object 
 1   Model_name       103 non-null    object 
 2   AccelSec         103 non-null    float64
 3   TopSpeed_KmH     103 non-null    int64  
 4   Range_Km         103 non-null    int64  
 5   Efficiency_WhKm  103 non-null    int64  
 6   FastCharge_KmH   103 non-null    object 
 7   RapidCharge      103 non-null    object 
 8   PowerTrain       103 non-null    object 
 9   PlugType         103 non-null    object 
 10  BodyStyle        103 non-null    object 
 11  Segment          103 non-null    object 
 12  Seats            103 non-null    int64  
 13  PriceEuro        103 non-null    int64  
dtypes: float64(1), int64(5), object(8)
memory usage: 11.4+ KB


In [64]:
# Remove four object-typed columns before further processing.
# NOTE(review): the original note gave "many missing values" as the reason, but
# dataset.info() reports 103 non-null entries for each of these columns -
# confirm the actual motivation (e.g. placeholder strings rather than NaN).
dataset = dataset.drop(
    ['FastCharge_KmH', 'RapidCharge', 'PlugType', 'Segment'],
    axis='columns',
)

In [65]:
# Preview the first rows (at most 15) of the table after the column drop.
dataset.head(15)

Unnamed: 0,Brand_name,Model_name,AccelSec,TopSpeed_KmH,Range_Km,Efficiency_WhKm,PowerTrain,BodyStyle,Seats,PriceEuro
0,Tesla,Model 3 Long Range Dual Motor,4.6,233,450,161,AWD,Sedan,5,55480
1,Volkswagen,ID.3 Pure,10.0,160,270,167,RWD,Hatchback,5,30000
2,Polestar,2,4.7,210,400,181,AWD,Liftback,5,56440
3,BMW,iX3,6.8,180,360,206,RWD,SUV,5,68040
4,Honda,e,9.5,145,170,168,RWD,Hatchback,4,32997
5,Lucid,Air,2.8,250,610,180,AWD,Sedan,5,105000
6,Volkswagen,e-Golf,9.6,150,190,168,FWD,Hatchback,5,31900
7,Peugeot,e-208,8.1,150,275,164,FWD,Hatchback,5,29682
8,Tesla,Model 3 Standard Range Plus,5.6,225,310,153,RWD,Sedan,5,46380
9,Audi,Q4 e-tron,6.3,180,400,193,AWD,SUV,5,55000


In [66]:
# Brand_name, Model_name, PowerTrain and BodyStyle hold text categories, which
# are awkward to use in numeric analysis, so they are converted to integer codes.

# Fit a separate LabelEncoder per column and keep each one in `label_encoders`.
# Refitting a single shared encoder would overwrite its learned classes, leaving
# only the last column's mapping available for decoding the codes later.
label_encoders = {}
for source_column, encoded_column in (
    ('Brand_name', 'brand'),
    ('Model_name', 'model'),
    ('PowerTrain', 'power_type'),
):
    # `label_encoder` stays bound to the most recently fitted encoder; after the
    # loop that is the PowerTrain one, exactly as with a single reused instance.
    label_encoder = LabelEncoder()
    dataset[encoded_column] = label_encoder.fit_transform(dataset[source_column])
    label_encoders[source_column] = label_encoder

# The BodyStyle column is grouped into three numeric codes to simplify processing.
def categorize_body_style(style):
    """Map a body-style label to a numeric code.

    Returns 1 for "SUV", 2 for "Sedan" and 0 for any other value.
    The comparison is exact (case-sensitive).
    """
    if style == "SUV":
        return 1
    if style == "Sedan":
        return 2
    # Everything else (hatchbacks, liftbacks, unknown values, ...) shares one code.
    return 0

# Readable names for the numeric columns that keep their original values.
new_column_names = dict(
    AccelSec='acceleration',
    TopSpeed_KmH='max_speed',
    Range_Km='power_reserve',
    Efficiency_WhKm='efficiency',
    Seats='seats',
    PriceEuro='price',
)

# Numeric body-style code, computed row by row with categorize_body_style.
dataset['body_type'] = dataset['BodyStyle'].apply(categorize_body_style)

# Drop the original text columns Brand_name, Model_name, PowerTrain and
# BodyStyle, then apply the readable names, in a single chain.
dataset = (
    dataset
    .drop(columns=['Brand_name', 'Model_name', 'PowerTrain', 'BodyStyle'])
    .rename(columns=new_column_names)
)

In [67]:
# Preview the first rows (at most 15) of the table after encoding and renaming.
dataset.head(15)

Unnamed: 0,acceleration,max_speed,power_reserve,efficiency,seats,price,brand,model,power_type,body_type
0,4.6,233,450,161,5,55480,30,46,0,2
1,10.0,160,270,167,5,30000,31,33,2,0
2,4.7,210,400,181,5,56440,23,0,0,0
3,6.8,180,360,206,5,68040,2,101,2,1
4,9.5,145,170,168,4,32997,9,78,2,0
5,2.8,250,610,180,5,105000,15,4,0,2
6,9.6,150,190,168,5,31900,31,83,1,0
7,8.1,150,275,164,5,29682,22,81,1,0
8,5.6,225,310,153,5,46380,30,48,2,2
9,6.3,180,400,193,5,55000,1,62,0,1


In [68]:
# Normalise the data so that columns measured on very different scales
# (e.g. price vs. seats) do not dominate the analysis: MinMaxScaler with its
# default settings rescales every column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole table, price included; if a
# train/test split follows, consider fitting it on the training part only.

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(dataset)
# fit_transform returns a bare NumPy array, so restore both the column labels
# and the original row index; otherwise the index is silently reset to 0..n-1.
dataset = pd.DataFrame(scaled_data, columns=dataset.columns, index=dataset.index)

In [69]:
# Preview the first rows of the table after min-max scaling.
dataset.head()

Unnamed: 0,acceleration,max_speed,power_reserve,efficiency,seats,price,brand,model,power_type,body_type
0,0.123153,0.383275,0.405714,0.337278,0.6,0.181407,0.9375,0.455446,0.0,1.0
1,0.389163,0.12892,0.2,0.372781,0.6,0.050654,0.96875,0.326733,1.0,0.0
2,0.128079,0.303136,0.348571,0.455621,0.6,0.186334,0.71875,0.0,0.0,0.0
3,0.231527,0.198606,0.302857,0.60355,0.6,0.24586,0.0625,1.0,1.0,0.5
4,0.364532,0.076655,0.085714,0.378698,0.4,0.066033,0.28125,0.772277,1.0,0.0


## Home Assignment 2. Correlation Coefficient