In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [147]:
# Load the raw dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('Luxury watch.csv')

In [148]:
# Preview the first rows to see how each raw column is formatted
# (units embedded in strings, mixed capitalisation, ...).
print(df.head())

       Brand       Model    Case Material   Strap Material Movement Type  \
0      Rolex  Submariner  Stainless Steel  Stainless Steel     Automatic   
1      Omega   Seamaster         Titanium           Rubber     Automatic   
2  Tag Heuer     Carrera  Stainless Steel          Leather     Automatic   
3  Breitling   Navitimer  Stainless Steel  Stainless Steel     Automatic   
4    Cartier   Tank Solo  Stainless Steel          Leather        Quartz   

  Water Resistance  Case Diameter (mm)  Case Thickness (mm)  Band Width (mm)  \
0       300 meters                40.0                13.00             20.0   
1       600 meters                43.5                14.47             21.0   
2       100 meters                41.0                13.00             20.0   
3        30 meters                43.0                14.25             22.0   
4        30 meters                31.0                 6.05             20.0   

  Dial Color Crystal Material Complications Power Reserve Pric

In [149]:
# Dtypes and non-null counts of the raw frame, before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Brand                507 non-null    object 
 1   Model                507 non-null    object 
 2   Case Material        507 non-null    object 
 3   Strap Material       507 non-null    object 
 4   Movement Type        507 non-null    object 
 5   Water Resistance     507 non-null    object 
 6   Case Diameter (mm)   507 non-null    float64
 7   Case Thickness (mm)  507 non-null    float64
 8   Band Width (mm)      507 non-null    float64
 9   Dial Color           507 non-null    object 
 10  Crystal Material     507 non-null    object 
 11  Complications        385 non-null    object 
 12  Power Reserve        493 non-null    object 
 13  Price (USD)          506 non-null    object 
dtypes: float64(3), object(11)
memory usage: 55.6+ KB


In [150]:
# Remove any parenthesised suffix (e.g. " (mm)", " (USD)") from the column
# headers so later cells can use the short names.
parenthesised_suffix = r"\s*\(.*?\)"
df.columns = df.columns.str.replace(parenthesised_suffix, "", regex=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             507 non-null    object 
 1   Model             507 non-null    object 
 2   Case Material     507 non-null    object 
 3   Strap Material    507 non-null    object 
 4   Movement Type     507 non-null    object 
 5   Water Resistance  507 non-null    object 
 6   Case Diameter     507 non-null    float64
 7   Case Thickness    507 non-null    float64
 8   Band Width        507 non-null    float64
 9   Dial Color        507 non-null    object 
 10  Crystal Material  507 non-null    object 
 11  Complications     385 non-null    object 
 12  Power Reserve     493 non-null    object 
 13  Price             506 non-null    object 
dtypes: float64(3), object(11)
memory usage: 55.6+ KB


In [151]:
# Strip the '$' and ',' characters from Price and cast the column to float.
# regex=False is passed explicitly: '$' is a regex end-of-string anchor and the
# default of `regex` differs between pandas versions, so the literal match must
# not depend on the installed version.
price_text = (
    df['Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['Price'] = price_text.astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             507 non-null    object 
 1   Model             507 non-null    object 
 2   Case Material     507 non-null    object 
 3   Strap Material    507 non-null    object 
 4   Movement Type     507 non-null    object 
 5   Water Resistance  507 non-null    object 
 6   Case Diameter     507 non-null    float64
 7   Case Thickness    507 non-null    float64
 8   Band Width        507 non-null    float64
 9   Dial Color        507 non-null    object 
 10  Crystal Material  507 non-null    object 
 11  Complications     385 non-null    object 
 12  Power Reserve     493 non-null    object 
 13  Price             506 non-null    float64
dtypes: float64(4), object(10)
memory usage: 55.6+ KB


In [152]:
# Drop the literal " meters" unit so Water Resistance can be stored as float.
resistance_text = df['Water Resistance'].str.replace(' meters', '', regex=False)
df['Water Resistance'] = resistance_text.astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             507 non-null    object 
 1   Model             507 non-null    object 
 2   Case Material     507 non-null    object 
 3   Strap Material    507 non-null    object 
 4   Movement Type     507 non-null    object 
 5   Water Resistance  507 non-null    float64
 6   Case Diameter     507 non-null    float64
 7   Case Thickness    507 non-null    float64
 8   Band Width        507 non-null    float64
 9   Dial Color        507 non-null    object 
 10  Crystal Material  507 non-null    object 
 11  Complications     385 non-null    object 
 12  Power Reserve     493 non-null    object 
 13  Price             506 non-null    float64
dtypes: float64(5), object(9)
memory usage: 55.6+ KB


In [153]:
# List the distinct raw Power Reserve strings to see which units/formats occur.
df['Power Reserve'].unique()

array(['48 hours', '60 hours', '42 hours', '70 hours', nan, '45 hours',
       '50 hours', '270 days', '38 hours', '80 hours', '64 hours',
       '40 hours', '56 hours', '4,200', '120 hours', '68 hours',
       '54 hours', '46 hours', '72 hours', '44 hours', '210 days',
       '65 hours', '43 hours', '41 hours', '55 hours', '168 hours'],
      dtype=object)

In [154]:
# Frequency of each raw Power Reserve value.
df['Power Reserve'].value_counts()

Unnamed: 0_level_0,count
Power Reserve,Unnamed: 1_level_1
60 hours,82
42 hours,55
70 hours,50
50 hours,40
72 hours,38
120 hours,30
45 hours,29
48 hours,27
38 hours,27
40 hours,22


In [155]:
# Number of missing Power Reserve entries before conversion.
print(df['Power Reserve'].isna().sum())

14


In [156]:
def _reserve_to_hours(raw):
    """Convert one raw Power Reserve entry to a number of hours.

    Strings containing 'days' are parsed as a day count and multiplied by 24.
    Every other string is parsed as an hour count after removing ' hours' and
    thousands separators. Non-string values (e.g. NaN) are returned unchanged.

    NOTE(review): a unit-less string such as '4,200' is therefore read as
    4200 hours; that value may be a data-entry error -- confirm against the
    source data.
    """
    if not isinstance(raw, str):
        return raw
    if 'days' in raw:
        return float(raw.replace(' days', '').strip().replace(',', '')) * 24
    return float(raw.replace(' hours', '').strip().replace(',', ''))

df['Power Reserve'] = df['Power Reserve'].apply(_reserve_to_hours)
# Missing count should be unchanged by the conversion.
print(df['Power Reserve'].isna().sum())

14


In [157]:
# Distinct Power Reserve values after conversion to hours; very large values
# come from the "days" entries and from the unit-less entry.
df['Power Reserve'].unique()

array([  48.,   60.,   42.,   70.,   nan,   45.,   50., 6480.,   38.,
         80.,   64.,   40.,   56., 4200.,  120.,   68.,   54.,   46.,
         72.,   44., 5040.,   65.,   43.,   41.,   55.,  168.])

In [158]:
# Confirm Power Reserve is now float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             507 non-null    object 
 1   Model             507 non-null    object 
 2   Case Material     507 non-null    object 
 3   Strap Material    507 non-null    object 
 4   Movement Type     507 non-null    object 
 5   Water Resistance  507 non-null    float64
 6   Case Diameter     507 non-null    float64
 7   Case Thickness    507 non-null    float64
 8   Band Width        507 non-null    float64
 9   Dial Color        507 non-null    object 
 10  Crystal Material  507 non-null    object 
 11  Complications     385 non-null    object 
 12  Power Reserve     493 non-null    float64
 13  Price             506 non-null    float64
dtypes: float64(6), object(8)
memory usage: 55.6+ KB


In [159]:
# Show the rows whose Complications value is missing.
df[df['Complications'].isna()]

Unnamed: 0,Brand,Model,Case Material,Strap Material,Movement Type,Water Resistance,Case Diameter,Case Thickness,Band Width,Dial Color,Crystal Material,Complications,Power Reserve,Price
4,Cartier,Tank Solo,Stainless Steel,Leather,Quartz,30.0,31.0,6.05,20.0,Silver,Sapphire,,,2800.0
5,Jaeger-LeCoultre,Reverso,Stainless Steel,Leather,Manual,30.0,42.9,9.20,20.0,Black,Sapphire,,45.0,5500.0
9,Hamilton,Khaki Field,Stainless Steel,Leather,Automatic,100.0,38.0,9.80,20.0,Black,Sapphire,,80.0,495.0
14,Breguet,Classique,18k White Gold,Leather,Manual,30.0,38.0,8.65,20.0,Silver,Sapphire,,40.0,16000.0
19,Panerai,Luminor Base Logo,Stainless Steel,Leather,Manual,100.0,44.0,10.50,22.0,Black,Sapphire,,56.0,3900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,Girard-Perregaux,Laureato,Stainless Steel,Stainless Steel,Automatic,100.0,42.0,10.88,25.0,Blue,Sapphire,,54.0,7500.0
485,Jaquet Droz,Grande Seconde,Stainless Steel,Leather,Automatic,30.0,43.0,11.48,22.0,White,Sapphire,,68.0,8500.0
491,IWC,Portugieser,Stainless Steel,Leather,Automatic,30.0,42.3,14.20,22.0,Blue,Sapphire,,60.0,9100.0
497,Tudor,Black Bay,Stainless Steel,Leather,Automatic,200.0,41.0,14.75,22.0,Black,Sapphire,,70.0,3800.0


In [160]:
# Replace missing Complications with the explicit string label 'None'.
complications = df['Complications']
df['Complications'] = complications.fillna('None')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             507 non-null    object 
 1   Model             507 non-null    object 
 2   Case Material     507 non-null    object 
 3   Strap Material    507 non-null    object 
 4   Movement Type     507 non-null    object 
 5   Water Resistance  507 non-null    float64
 6   Case Diameter     507 non-null    float64
 7   Case Thickness    507 non-null    float64
 8   Band Width        507 non-null    float64
 9   Dial Color        507 non-null    object 
 10  Crystal Material  507 non-null    object 
 11  Complications     507 non-null    object 
 12  Power Reserve     493 non-null    float64
 13  Price             506 non-null    float64
dtypes: float64(6), object(8)
memory usage: 55.6+ KB


In [161]:
# Frequency of each raw Case Material label, to spot spelling/case variants.
print(df['Case Material'].value_counts())

Case Material
Stainless Steel           404
Titanium                   37
Rose Gold                  12
18k Rose Gold              10
Ceramic                     9
White Gold                  9
18k White Gold              7
18K White Gold              5
18K Rose Gold               3
Carbon Fiber                3
18K Yellow Gold             2
German Submarine Steel      1
High-Tech Ceramic           1
Bronze                      1
Yellow Gold                 1
18k King Gold               1
18k Yellow Gold             1
Name: count, dtype: int64


In [162]:
# Lower-case the labels so the lower-case substring checks in
# normalize_case_material match regardless of the source capitalisation.
df['Case Material'] = df['Case Material'].str.lower()

# Ordered (keyword, canonical label) rules; the first keyword found wins, so
# the order mirrors the priority of the original if/elif chain.
_CASE_MATERIAL_RULES = (
    ('rose gold', 'Rose Gold'),
    ('king gold', 'Rose Gold'),
    ('white gold', 'White Gold'),
    ('yellow gold', 'Yellow Gold'),
    ('steel', 'Stainless Steel'),
    ('titanium', 'Titanium'),
    ('ceramic', 'Ceramic'),
    ('carbon', 'Carbon Fiber'),
    ('bronze', 'Bronze'),
)


def normalize_case_material(x):
    """Map a free-text case-material label to a canonical category.

    Matching is by substring and is case-insensitive, so the result does not
    depend on the caller lower-casing the value first.

    Args:
        x: Raw label. Non-string values (e.g. NaN / None) are returned
            unchanged instead of raising TypeError.

    Returns:
        The canonical label of the first matching rule, or ``x.title()`` when
        no rule matches.
    """
    if not isinstance(x, str):
        return x
    lowered = x.lower()
    for keyword, label in _CASE_MATERIAL_RULES:
        if keyword in lowered:
            return label
    return x.title()

# Collapse the label variants into canonical categories, then re-check counts.
case_material_clean = df['Case Material'].apply(normalize_case_material)
df['Case Material'] = case_material_clean
print(df['Case Material'].value_counts())

Case Material
Stainless Steel    405
Titanium            37
Rose Gold           26
White Gold          21
Ceramic             10
Yellow Gold          4
Carbon Fiber         3
Bronze               1
Name: count, dtype: int64


In [163]:
# Frequency of each raw Strap Material label.
df['Strap Material'].value_counts()

Unnamed: 0_level_0,count
Strap Material,Unnamed: 1_level_1
Leather,224
Stainless Steel,166
Rubber,74
Fabric,11
Canvas,10
Silicone,5
NATO Strap,4
Jubilee Bracelet,3
Jubilee,2
Alligator,2


In [164]:
# Lower-case the labels before normalize_strap_material runs its substring
# checks, which are written against lower-case keywords.
df['Strap Material'] = df['Strap Material'].str.lower()

# Ordered (keyword, canonical label) rules; the first keyword found wins, so
# the order mirrors the priority of the original if/elif chain.
_STRAP_MATERIAL_RULES = (
    ('leather', 'Leather'),
    ('steel', 'Steel'),
    ('rubber', 'Rubber'),
    ('fabric', 'Fabric'),
    ('canvas', 'Canvas'),
    ('nato', 'Nato'),
    ('jubilee', 'Jubilee'),
    ('alligator', 'Alligator'),
    ('rose gold', 'Rose Gold'),
    ('titanium', 'Titanium'),
    # Was the capitalised 'Textile', which could never match a lower-cased
    # label, so e.g. 'textile strap' fell through to .title().
    ('textile', 'Textile'),
)


def normalize_strap_material(x):
    """Map a free-text strap-material label to a canonical category.

    Matching is by substring and is case-insensitive, so the result does not
    depend on the caller lower-casing the value first.

    Args:
        x: Raw label. Non-string values (e.g. NaN / None) are returned
            unchanged instead of raising TypeError.

    Returns:
        The canonical label of the first matching rule, or ``x.title()`` when
        no rule matches.
    """
    if not isinstance(x, str):
        return x
    lowered = x.lower()
    for keyword, label in _STRAP_MATERIAL_RULES:
        if keyword in lowered:
            return label
    return x.title()

# Collapse the label variants into canonical categories, then re-check counts.
strap_material_clean = df['Strap Material'].apply(normalize_strap_material)
df['Strap Material'] = strap_material_clean
print(df['Strap Material'].value_counts())

Strap Material
Leather      224
Steel        166
Rubber        74
Fabric        11
Canvas        10
Nato           5
Jubilee        5
Silicone       5
Titanium       2
Alligator      2
Rose Gold      2
Textile        1
Name: count, dtype: int64


In [165]:
# Frequency of each Movement Type label (checked for variants needing clean-up).
df['Movement Type'].value_counts()

Unnamed: 0_level_0,count
Movement Type,Unnamed: 1_level_1
Automatic,442
Manual,51
Quartz,12
Eco-Drive,2


In [166]:
# Frequency of each Dial Color label.
df['Dial Color'].value_counts()

Unnamed: 0_level_0,count
Dial Color,Unnamed: 1_level_1
Black,217
Blue,136
Silver,118
White,33
Ivory,1
Champagne,1
Grey,1


In [167]:
# Frequency of each Crystal Material label.
df['Crystal Material'].value_counts()

Unnamed: 0_level_0,count
Crystal Material,Unnamed: 1_level_1
Sapphire,489
Hesalite,13
Hardlex,3
Mineral,2


In [168]:
# Frame summary after label normalisation; Power Reserve and Price still
# contain missing values at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             507 non-null    object 
 1   Model             507 non-null    object 
 2   Case Material     507 non-null    object 
 3   Strap Material    507 non-null    object 
 4   Movement Type     507 non-null    object 
 5   Water Resistance  507 non-null    float64
 6   Case Diameter     507 non-null    float64
 7   Case Thickness    507 non-null    float64
 8   Band Width        507 non-null    float64
 9   Dial Color        507 non-null    object 
 10  Crystal Material  507 non-null    object 
 11  Complications     507 non-null    object 
 12  Power Reserve     493 non-null    float64
 13  Price             506 non-null    float64
dtypes: float64(6), object(8)
memory usage: 55.6+ KB


In [169]:
# Low-cardinality text columns that are stored as pandas categoricals.
category_col = [
    'Case Material',
    'Strap Material',
    'Movement Type',
    'Dial Color',
    'Crystal Material',
]
as_categories = df[category_col].astype('category')
df[category_col] = as_categories
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Brand             507 non-null    object  
 1   Model             507 non-null    object  
 2   Case Material     507 non-null    category
 3   Strap Material    507 non-null    category
 4   Movement Type     507 non-null    category
 5   Water Resistance  507 non-null    float64 
 6   Case Diameter     507 non-null    float64 
 7   Case Thickness    507 non-null    float64 
 8   Band Width        507 non-null    float64 
 9   Dial Color        507 non-null    category
 10  Crystal Material  507 non-null    category
 11  Complications     507 non-null    object  
 12  Power Reserve     493 non-null    float64 
 13  Price             506 non-null    float64 
dtypes: category(5), float64(6), object(3)
memory usage: 39.7+ KB


In [170]:
# Fill the remaining gaps in both numeric columns with that column's median.
# NOTE(review): Price looks like the modelling target (a regressor and
# train_test_split are imported above). Imputing a target with the median
# fabricates a label, and computing medians before the split uses test rows --
# consider dropping the row with missing Price and imputing after splitting.
# Confirm intent.
for numeric_col in ('Power Reserve', 'Price'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Brand             507 non-null    object  
 1   Model             507 non-null    object  
 2   Case Material     507 non-null    category
 3   Strap Material    507 non-null    category
 4   Movement Type     507 non-null    category
 5   Water Resistance  507 non-null    float64 
 6   Case Diameter     507 non-null    float64 
 7   Case Thickness    507 non-null    float64 
 8   Band Width        507 non-null    float64 
 9   Dial Color        507 non-null    category
 10  Crystal Material  507 non-null    category
 11  Complications     507 non-null    object  
 12  Power Reserve     507 non-null    float64 
 13  Price             507 non-null    float64 
dtypes: category(5), float64(6), object(3)
memory usage: 39.7+ KB
