In [1]:
import json
import re

import pandas as pd

In [2]:
def read_json_from_file(file_path):
    """Parse the UTF-8 encoded JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized content, exactly as produced by ``json.load``.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [3]:
def json_to_dataframe(json_data):
    """Wrap already-parsed JSON data in a pandas DataFrame.

    Args:
        json_data: Any structure accepted by the ``pd.DataFrame``
            constructor (for example a list of record dicts).

    Returns:
        A new ``pd.DataFrame`` built from ``json_data``.
    """
    return pd.DataFrame(data=json_data)

In [4]:
# Model names that may appear in a listing title.
_BMW_MODELS = (
    "1M", "3GT", "5GT", "6GT", "i3", "i4", "i5", "i7", "i8", "Inny", "iX", "iX1", "iX2", "iX3",
    "M2", "M3", "M4", "M5", "M6", "M8", "Seria 1", "Seria 2", "Seria 3", "Seria 4", "Seria 5",
    "Seria 6", "Seria 7", "Seria 8", "X1 M", "X2 M", "X3 M", "X4 M", "X5 M", "X6 M", "X7 M", "XM",
    "Z1 M", "Z3 M", "Z4 M", "Z8 M",
    "X1", "X2", "X3", "X4", "X5", "X6", "X7", "Z1", "Z3", "Z4", "Z8",
)

# Longer names are listed first so that, at a given position, "X5 M" is tried
# before "X5" and "iX1" before "iX". The lookarounds force the name to be a
# whole token: "M3" must not match inside "M340i", nor "X5 M" inside "X5 M50d".
_BMW_MODEL_RE = re.compile(
    r"(?<!\w)(?:"
    + "|".join(re.escape(name) for name in sorted(_BMW_MODELS, key=len, reverse=True))
    + r")(?!\w)"
)


def extract_info(title):
    """Split a listing title into brand, model and car type.

    The model is the leftmost known model name that appears in ``title`` as a
    whole token (longest name wins when several start at the same position).
    The car type is everything that follows the model, stripped of
    surrounding whitespace.

    Plain substring matching in list order was wrong for common titles:
    "BMW Seria 2 M240i" resolved to model "M2", "BMW iX1 ..." to "iX" and
    "BMW X5 M50d" to "X5 M".

    Args:
        title: Listing title, e.g. ``"BMW Seria 3 330i Sport Line"``.

    Returns:
        Tuple ``(brand, model, car_type)``. ``brand`` is always ``"BMW"``.
        ``model`` is ``None`` when no known model is found (or ``title`` is
        not a string). ``car_type`` is ``None`` when nothing follows the model.
    """
    brand = "BMW"

    if not isinstance(title, str):
        return brand, None, None

    match = _BMW_MODEL_RE.search(title)
    if match is None:
        return brand, None, None

    # Take the whole remainder rather than title.split(model)[1], which
    # stopped at a second occurrence of the model text.
    remainder = title[match.end():].strip()
    # NOTE(review): "BMW X5 M Sport" still resolves to model "X5 M", as it
    # did before; confirm whether such titles occur in the data.
    return brand, match.group(0), remainder or None

In [5]:
def test_extract_info():
    """Check ``extract_info`` against a set of hand-labelled listing titles.

    Prints each title before and after it is verified and raises
    ``AssertionError`` on the first mismatch.
    """
    # Every sample is expected to yield this brand.
    want_brand = "BMW"

    # (title, expected model, expected car type)
    cases = [
        ("BMW M5", "M5", None),
        ("BMW X1 xDrive18d xLine", "X1", "xDrive18d xLine"),
        ("BMW M3", "M3", None),
        ("BMW Seria 5", "Seria 5", None),
        ("BMW X5 3.0 d Edition Exclusive", "X5", "3.0 d Edition Exclusive"),
        ("BMW Seria 3 330i Sport Line", "Seria 3", "330i Sport Line"),
        ("BMW X6 40d xDrive", "X6", "40d xDrive"),
        ("BMW Seria 1 116i", "Seria 1", "116i"),
        ("BMW X5 xDrive25d sport", "X5", "xDrive25d sport"),
        ("BMW Seria 3 320d DPF Touring Edition Fleet", "Seria 3", "320d DPF Touring Edition Fleet"),
    ]

    for title, want_model, want_type in cases:
        print(f"Title {title}")
        got_brand, got_model, got_type = extract_info(title)
        assert got_brand == want_brand, f"Expected brands {want_brand}, but got {got_brand}"
        assert got_model == want_model, f"Expected models {want_model}, but got {got_model}"
        assert got_type == want_type, f"Expected car types {want_type}, but got {got_type}"
        print(f"Title {title} passed")


# Run the test
test_extract_info()

Title BMW M5
Title BMW M5 passed
Title BMW X1 xDrive18d xLine
Title BMW X1 xDrive18d xLine passed
Title BMW M3
Title BMW M3 passed
Title BMW Seria 5
Title BMW Seria 5 passed
Title BMW X5 3.0 d Edition Exclusive
Title BMW X5 3.0 d Edition Exclusive passed
Title BMW Seria 3 330i Sport Line
Title BMW Seria 3 330i Sport Line passed
Title BMW X6 40d xDrive
Title BMW X6 40d xDrive passed
Title BMW Seria 1 116i
Title BMW Seria 1 116i passed
Title BMW X5 xDrive25d sport
Title BMW X5 xDrive25d sport passed
Title BMW Seria 3 320d DPF Touring Edition Fleet
Title BMW Seria 3 320d DPF Touring Edition Fleet passed


In [6]:
def convert_price(row, eur_rate=4.33):
    """Return the row's price, converting EUR amounts with ``eur_rate``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the keys
            ``'price'`` and ``'price_currency'``.
        eur_rate: Multiplier applied to prices whose currency is ``"EUR"``.
            Defaults to 4.33, the value that used to be hard-coded here
            (presumably an EUR -> PLN rate; confirm and refresh before reuse).

    Returns:
        ``row['price'] * eur_rate`` when the currency is ``"EUR"``, otherwise
        ``row['price']`` unchanged.
    """
    if row['price_currency'] == "EUR":
        return row['price'] * eur_rate
    return row['price']

In [7]:
# Read the raw listings and show the column overview.
file_path = '../../data.json'
json_data = read_json_from_file(file_path)
df = json_to_dataframe(json_data)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
#print(df.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400 entries, 0 to 6399
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           6400 non-null   object
 1   url             6400 non-null   object
 2   id              6400 non-null   object
 3   image_url       6396 non-null   object
 4   img_local       6344 non-null   object
 5   type_of_seller  4221 non-null   object
 6   price           6400 non-null   object
 7   price_currency  6400 non-null   object
 8   mileage         6400 non-null   object
 9   fuel_type       6400 non-null   object
 10  gearbox         6400 non-null   object
 11  year            6400 non-null   object
dtypes: object(12)
memory usage: 600.1+ KB
None


In [8]:
# cleaning part 1
# Deduplicate by listing id, then keep only rows that have a local image.
# The image filter must run on the deduplicated frame: filtering `df` again
# silently threw away the drop_duplicates() result.
cleaned_data = df.drop_duplicates(subset="id")
cleaned_data = cleaned_data[cleaned_data['img_local'].notna()]
cleaned_data = cleaned_data.drop(columns=['url', 'image_url', 'id'])
# Split the title into brand / model / car_type; the title and the brand
# column are dropped right after.
cleaned_data[['brand', 'model', 'car_type']] = cleaned_data['title'].apply(lambda x: pd.Series(extract_info(x)))
cleaned_data = cleaned_data.drop(columns=['title', 'brand'])
# Normalise column names: lower case, trimmed, spaces replaced by underscores.
cleaned_data.columns = cleaned_data.columns.str.lower().str.strip().str.replace(' ', '_')
# Numeric fields arrive as strings containing spaces (and a " km" suffix for
# mileage); remove those before casting to int.
cleaned_data['price'] = cleaned_data['price'].str.replace(' ', '').astype(int)
cleaned_data['year'] = cleaned_data['year'].str.replace(' ', '').astype(int)
# Convert EUR prices first; price_currency is dropped once it has been used.
cleaned_data['price'] = cleaned_data.apply(convert_price, axis=1)
cleaned_data['mileage'] = cleaned_data['mileage'].str.replace(' km', '').str.replace(' ', '').astype(int)
cleaned_data = cleaned_data.drop(columns=['price_currency'])

In [9]:
# Seller-type distribution; missing values are counted as their own bucket.
cleaned_data.loc[:, 'type_of_seller'].value_counts(dropna=False)

type_of_seller
Prywatny sprzedawca    4183
None                   2150
Firma                    11
Name: count, dtype: int64

In [10]:
import matplotlib.pyplot as plt
from PIL import Image
import os

def show_samples(rows, image_folder=os.path.join('..', '..', 'data_img'), n_cols=5):
    """Plot the locally stored images of the given listings in a grid.

    Args:
        rows: DataFrame with an ``img_local`` column holding image file names
            relative to ``image_folder``. An ``id`` column, when present, is
            used for the subplot titles; otherwise the file name is shown.
        image_folder: Directory containing the images. The default is built
            with ``os.path.join`` so it works with any path separator.
        n_cols: Number of grid columns. Grid rows are added as needed, so
            more than 20 samples no longer overflow a fixed 4x5 layout.

    Files that do not exist are reported on stdout and leave their grid cell
    empty.
    """
    # Ceiling division; at least one grid row so the figure size stays valid.
    n_rows = max(1, -(-len(rows) // n_cols))

    # 3 x 2.5 inches per cell reproduces the former 15 x 10 figure for 4x5.
    plt.figure(figsize=(3 * n_cols, 2.5 * n_rows))
    for i, (_, row) in enumerate(rows.iterrows()):
        img_local = row['img_local']
        image_path = os.path.join(image_folder, img_local)

        if os.path.isfile(image_path):  # skip files that are missing on disk
            # One subplot per image.
            plt.subplot(n_rows, n_cols, i + 1)
            # Context manager closes the file handle once the image is drawn.
            with Image.open(image_path) as img:
                plt.imshow(img)
            plt.axis('off')
            # Frames that no longer carry an 'id' column used to raise KeyError.
            label = row['id'] if 'id' in row.index else img_local
            plt.title(f"ID: {label}")
        else:
            print(f"Obraz {os.path.abspath(image_path)} nie istnieje.")

    plt.tight_layout()
    plt.show()

In [11]:
#null_seller_rows = cleaned_data[cleaned_data['type_of_seller'].isna()].sample(n=20, random_state=15)
#show_samples(null_seller_rows)

In [12]:
# cleaning part 2
# Collapse the seller type into a boolean flag. Any other value, including
# "Firma" and missing entries, compares unequal and becomes False.
# (The bare `cleaned_data` expression that used to sit here had no effect.)
cleaned_data['is_private'] = cleaned_data['type_of_seller'].eq("Prywatny sprzedawca")
cleaned_data = cleaned_data.drop(columns=['type_of_seller'])

In [13]:
# Keep only the first space-separated word of each car type; entries that are
# not strings become None.
cleaned_data['car_type_main'] = cleaned_data['car_type'].map(
    lambda text: text.strip().split(' ', 1)[0].strip() if isinstance(text, str) else None
)
pd.set_option('display.max_rows', 50)
car_type_main = cleaned_data['car_type_main'].value_counts(dropna=False)
# Discard rows whose main car type is missing or an empty string.
has_main_type = cleaned_data['car_type_main'].notna() & cleaned_data['car_type_main'].ne('')
cleaned_data = cleaned_data[has_main_type]

In [14]:
# cleaning part 3: remove the full car_type column.
cleaned_data = cleaned_data.drop(labels='car_type', axis='columns')


In [15]:
# info() prints its own report and returns None; printing that return value
# only added a stray "None" line to the output.
cleaned_data.info()
print(cleaned_data.head(5))

<class 'pandas.core.frame.DataFrame'>
Index: 4686 entries, 0 to 6399
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   img_local      4686 non-null   object 
 1   price          4686 non-null   float64
 2   mileage        4686 non-null   int64  
 3   fuel_type      4686 non-null   object 
 4   gearbox        4686 non-null   object 
 5   year           4686 non-null   int64  
 6   model          4686 non-null   object 
 7   is_private     4686 non-null   bool   
 8   car_type_main  4686 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(5)
memory usage: 334.1+ KB
None
             img_local     price  mileage fuel_type       gearbox  year  \
0  619888553496191.jpg  255000.0        6    Diesel  Automatyczna  2023   
1  878738710972359.jpg  249600.0        5    Diesel  Automatyczna  2024   
2  325534139055352.jpg   52990.0   230000    Diesel  Automatyczna  2016   
3  586230984198372.jpg  149000.0   18

In [16]:
# Frequency of each main car type in the cleaned data (missing values included).
a = cleaned_data.loc[:, 'car_type_main'].value_counts(dropna=False)

In [17]:
# Write the cleaned dataset as JSON Lines (one record per line).
cleaned_data.to_json(
    path_or_buf='../../cleaned_base_data_with_car_type.json',
    orient='records',
    lines=True,
)


In [18]:
# Hold out a seeded 15% sample as the test set; every remaining row goes to
# the training set. Both are written as JSON Lines.
test_data = cleaned_data.sample(frac=0.15, random_state=42)
in_test = cleaned_data.index.isin(test_data.index)
train_data = cleaned_data[~in_test]
train_data.to_json("../../train_data_with_car_type.json", orient="records", lines=True)
test_data.to_json("../../test_data_with_car_type.json", orient="records", lines=True)