In [422]:
# %pip install pandas 
# %pip install numpy 
# %pip install matplotlib 
# %pip install seaborn
# %pip install scikit-learn


In [423]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")


In [424]:
df = pd.read_csv('../datasets/cars.csv', encoding='latin-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Company Names              1218 non-null   object
 1   Cars Names                 1218 non-null   object
 2   Engines                    1218 non-null   object
 3   CC/Battery Capacity        1215 non-null   object
 4   HorsePower                 1218 non-null   object
 5   Total Speed                1218 non-null   object
 6   Performance(0 - 100 )KM/H  1212 non-null   object
 7   Cars Prices                1218 non-null   object
 8   Fuel Types                 1218 non-null   object
 9   Seats                      1218 non-null   object
 10  Torque                     1217 non-null   object
dtypes: object(11)
memory usage: 104.8+ KB


In [425]:
df.sample(20, random_state=42)

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
541,Porsche,Cayenne Turbo,4.0L V8 Twin Turbo,3996 cc,541 HP,286 km/h,4.0 sec,"$130,000",Petrol,5,770 Nm
259,HYUNDAI,Sonata,2.5L Inline-4,"2,500 cc",191 hp,240 km/h,8 sec,"$25,000",Petrol/Hybrid,5,250 Nm
43,LAMBORGHINI,AVENTADOR SVJ,V12,"6,498 cc",759 hp,350 km/h,2.8 sec,"$518,000",Petrol,2,720 Nm
1008,Peugeot,5008 GT,Inline-4,"1,598 cc",180 hp,210 km/h,8.2 sec,"$40,000",Petrol,7,250 Nm
584,Porsche,Macan T,2.0L Turbo Inline-4,1984 cc,265 hp,233 km/h,6.2,"$63,000",Petrol,5,400 Nm
310,Volkswagen,Crafter,2.0L Turbo Diesel I4,1968 cc,102 - 177 hp,160 km/h,13.0 sec,"$40,000",Diesel,3,410 Nm
657,Tata Motors,Nexon EV Prime,Permanent Magnet Synchronous,30.2 kWh,129 hp,120 km/h,9.4 sec,"$22,000",Electric,5,245 Nm
1211,Mazda,Millenia S,2.5L V6,"2,500 cc",200 hp,220 km/h,8.0 sec,"$30,000 - $35,000",Petrol,5,250 Nm
210,TOYOTA,VENZA,I4,"2,494 cc",219 hp,200 km/h,7.5 sec,"$33,400",Hybrid,5,400 Nm
155,BMW,118D,I4,"1,995 cc",150 hp,216 km/h,8.4 sec,"$34,000",Diesel,5,380 Nm


In [426]:
df.describe()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
count,1218,1218,1218,1215,1218,1218,1212,1218,1218,1218,1217
unique,37,1201,356,311,456,114,180,535,23,19,263
top,Nissan,Macan T,I4,"2,000 cc",355 hp,250 km/h,6.5 sec,"$35,000",Petrol,5,400 Nm
freq,149,2,64,31,23,145,45,36,871,692,72


In [427]:
# rename columns
df.rename(columns={'Company Names': 'company', 
                   'Cars Names': 'model', 
                   'Engines': 'engines', 
                   'CC/Battery Capacity': 'power_capacity',
                   'HorsePower': 'horsepower', 
                   'Total Speed': 'total_speed', 
                   'Performance(0 - 100 )KM/H': 'performance', 
                   'Cars Prices': 'prices',
                   'Fuel Types': 'fuel_types', 
                   'Seats': 'seats', 
                   'Torque': 'torque'}, 
          inplace=True)

In [428]:
import re
def extraer_valores(texto):
    match = re.search(r'(\d+(?:[\.,]\d+)?)', texto)  
    unidad = re.search(r'(cc|kwh)', texto, re.IGNORECASE)
    cantidad = match.group(1).replace(',', '.') if match else None
    unidad = unidad.group(1).lower() if unidad else None
    return float(cantidad), unidad

In [None]:
# clean columns
df['company'] = df['company'].str.capitalize()

# clean power_capacity
df["power_capacity"] = df["power_capacity"].str.lower()
df['power_capacity'] = df['power_capacity'].str.replace(r'[()]', '', regex=True)
df["power_capacity"] = df["power_capacity"].str.replace(r'standard', ' ', regex=True)
df[["power_capacity", "unit_power_capacity"]] = df["power_capacity"].str.extract(r'([\d,\.]+)\s*(\w+)', expand=True)
df["power_capacity"] = df["power_capacity"].str.replace(',', '').astype(float)

# clean prices
df["prices"] = df["prices"].str.replace(r'[$.,â‚¬]', '', regex=True)
df['prices'] = df['prices'].where(~df['prices'].str.contains('-', na=False), df['prices'].str.split('-', expand=True)[1])
df['prices'] = df['prices'].str.split(" ", expand=True)[0]

# clean performance
df['performance'] = df['performance'].str.lower()
df['performance'] = df['performance'].str.replace('sec', '')
df['performance'] = df['performance'].str.split(" ", expand=True)[0]

#clear torque
df['torque'] = df['torque'].str.lower()
df['torque'] = df['torque'].str.replace("nm", "")
df['torque'] = df['torque'].where(~df['torque'].str.contains("-", na=False), df['torque'].str.split('-',expand=True)[1])
df['torque'] = df['torque'].str.split(' ', expand=True)[0]

#clear horsepower
df['horsepower'] = df['horsepower'].str.lower().str.replace("hp", "")
df['horsepower'] = df['horsepower'].str.split('-', expand=True)[0]
df['horsepower'] = df['horsepower'].str.split(' ', expand=True)[0]
df["horsepower"] = df['horsepower'].str.replace(r'[^\d]', '', regex=True)


# clear top_speed
df["total_speed"] = df['total_speed'].str.replace(r'[^\d]', '', regex=True)




ValueError: invalid literal for int() with base 10: ''

In [None]:
df

In [None]:
companys = df['company'].value_counts()
companys.plot(kind='bar', figsize=(12,6))
plt.title('Number of Cars by Company')
plt.xlabel('Company')
plt.ylabel('Number of Cars')
plt.show()
