In [1]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

2023-08-10 11:21:28.142562: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import geopy
from geopy.geocoders import Nominatim

In [3]:
# Load the train, test and submit_sample CSVs from the data/ directory.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/submit_sample.csv')
)

In [4]:
# Lookup table that is left-joined onto train/test by 'region' below.
# Presumably holds latitude/longitude per region (columns not shown here — TODO confirm).
lat_long_df = pd.read_csv('lat_long_info.csv')

In [5]:
# Left-join the region lookup onto both splits, then discard the raw
# 'region' and 'state' columns.
train_prep = (
    train
    .merge(lat_long_df, how='left', on='region')
    .drop(columns=['region', 'state'])
)
test_prep = (
    test
    .merge(lat_long_df, how='left', on='region')
    .drop(columns=['region', 'state'])
)

In [16]:
def generate_embeddings(df, cat_col, name, embedding_size=2, epochs=50, batch_size=16, seed=None):
    """Learn a small Keras embedding for one categorical column.

    Missing values in ``df[cat_col]`` are replaced with ``'other'`` and the
    filled column is written back to ``df`` (this side effect is kept on
    purpose: later cells read the filled column). Every value is tokenised
    with a Keras ``Tokenizer``, a model made of a single ``Embedding`` layer
    is fit with the padded token ids as both input and target, and the
    layer's weight matrix supplies one vector per token id.

    NOTE(review): because the only layer is the embedding itself and the MSE
    target is the token id, the learned vectors appear to carry little beyond
    the id (the recorded notebook output shows both dimensions equal on every
    row). Confirm this objective is what is wanted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``cat_col``; its ``cat_col`` is NaN-filled in place.
    cat_col : str
        Name of the categorical (string) column to embed.
    name : str
        Prefix of the output columns (``{name}_1``, ``{name}_2``, ...).
    embedding_size : int, default 2
        Number of embedding dimensions / output columns.
    epochs : int, default 50
        Training epochs passed to ``model.fit``.
    batch_size : int, default 16
        Batch size passed to ``model.fit``.
    seed : int or None, default None
        When given, seeds Keras/TensorFlow/NumPy/Python RNGs so repeated runs
        produce the same weights. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with one column per embedding dimension. A row
        whose value tokenises to several tokens gets the mean of their
        vectors; a row that tokenises to nothing gets NaN.
    """
    if seed is not None:
        # Local import: only needed when a reproducible run is requested.
        from keras.utils import set_random_seed
        set_random_seed(seed)

    # Assign the filled column back explicitly. Calling
    # df[cat_col].fillna(..., inplace=True) acts on a temporary selection:
    # newer pandas versions warn about that pattern and, with Copy-on-Write
    # enabled, it leaves df unchanged.
    df[cat_col] = df[cat_col].fillna('other')

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df[cat_col])
    sequences = tokenizer.texts_to_sequences(df[cat_col])

    max_sequence_length = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

    # One extra slot on top of the vocabulary: the padded sequences can also
    # contain the padding id in addition to the ids listed in word_index.
    vocab_size = len(tokenizer.word_index) + 1

    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, output_dim = embedding_size, input_length = max_sequence_length, name="embedding"))
    model.compile(loss="mse", optimizer="adam", metrics=["accuracy"])
    model.fit(x=padded_sequences, y=padded_sequences, epochs=epochs, batch_size=batch_size, verbose=0)

    # Weight matrix of the embedding layer: one row per token id.
    learned_embeddings = model.layers[0].get_weights()[0]
    embedding_dimensions = learned_embeddings.shape[1]

    # Look rows up through their token ids rather than through the raw cell
    # text: the tokenizer normalises text before building word_index, so a raw
    # value such as an upper-case or multi-word label is not necessarily a key
    # of word_index and a direct dictionary lookup would raise KeyError.
    # For a value that is a single token this yields exactly that token's vector.
    row_vectors = np.full((len(sequences), embedding_dimensions), np.nan)
    for row, token_ids in enumerate(sequences):
        if token_ids:
            row_vectors[row] = learned_embeddings[token_ids].mean(axis=0)

    embedding_columns = [f'{name}_{i+1}' for i in range(embedding_dimensions)]
    return pd.DataFrame(row_vectors, index=df.index, columns=embedding_columns)

In [7]:
# Translation table built once: full-width digits/letters and a few
# look-alike characters mapped to their ASCII counterparts.
_FULL_WIDTH_TO_ASCII = str.maketrans({
    '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
    '５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
    'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E',
    'Ｆ': 'F', 'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J',
    'Ｋ': 'K', 'Ｌ': 'L', 'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O',
    'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R', 'Ｓ': 'S', 'Ｔ': 'T',
    'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W', 'Ｘ': 'X', 'Ｙ': 'Y',
    'Ｚ': 'Z', 'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd',
    'ｅ': 'e', 'ｆ': 'f', 'ｇ': 'g', 'ｈ': 'h', 'ｉ': 'i',
    'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l', 'ｍ': 'm', 'ｎ': 'n',
    'ｏ': 'o', 'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r', 'ｓ': 's',
    'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
    'ｙ': 'y', 'ｚ': 'z', 'ᴄ': 'c', '−': '-', 'ー': '-',
    'α': 'a'
})


def preprocess_text(text):
    """Normalise a category label to a compact lower-case token.

    Characters listed in the translation table are replaced by their ASCII
    counterparts; ideographic spaces (U+3000), ordinary spaces and hyphens
    are then removed, and the result is lower-cased and stripped.

    Parameters
    ----------
    text : str or any
        The label to normalise.

    Returns
    -------
    str or any
        The normalised string. Non-string input (for example the float NaN
        that pandas uses for missing cells, or None) is returned unchanged
        instead of raising ``AttributeError``, so the function can be applied
        to a column before its missing values are filled.
    """
    if not isinstance(text, str):
        return text

    # Single pass over the string; no replacement value is itself a key of
    # the table, so this matches replacing the characters one after another.
    text = text.translate(_FULL_WIDTH_TO_ASCII)

    # Drop ideographic spaces, ordinary spaces and hyphens (including the
    # dash look-alikes translated to '-' above), then lower-case.
    text = text.replace('\u3000', '').replace(' ', '').replace('-', '').lower().strip()

    return text

In [41]:
# NOTE(review): recorded as In [41], i.e. after the feature-engineering cells
# further down — the output already shows car_age and the *_fueled,
# *_wheel_drive and *_transmission flag columns.
train_prep

Unnamed: 0,id,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,...,car_age,gas_fueled,diesel_fueled,hybrid_fueled,electric_fueled,front_wheel_drive,four_wheel_drive,rear_wheel_drive,automatic_transmission,manual_transmission
0,0,1949,bmw,5.0,6.0,gas,115148,clean,manual,rwd,...,74,1,0,0,0,0,0,1,0,1
1,1,2013,toyota,1.0,8.0,gas,172038,clean,automatic,rwd,...,10,1,0,0,0,0,0,1,1,0
2,2,1998,ford,2.0,6.0,gas,152492,clean,automatic,fwd,...,25,1,0,0,0,1,0,0,1,0
3,3,2014,ford,5.0,4.0,gas,104118,clean,manual,fwd,...,9,1,0,0,0,1,0,0,0,1
4,4,2005,ford,5.0,6.0,gas,144554,clean,manual,fwd,...,18,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27527,27527,2008,ford,2.0,6.0,gas,26660,clean,automatic,rwd,...,15,1,0,0,0,0,0,1,1,0
27528,27528,2007,ford,5.0,8.0,gas,108072,clean,automatic,rwd,...,16,1,0,0,0,0,0,1,1,0
27529,27529,2019,jeep,,6.0,gas,139908,clean,automatic,4wd,...,4,1,0,0,0,0,1,0,1,0
27530,27530,2007,jeep,5.0,6.0,gas,112326,clean,automatic,4wd,...,16,1,0,0,0,0,1,0,1,0


In [9]:
# Normalise manufacturer spellings, then fold every brand that occurs in
# fewer than 150 rows into a single 'other' bucket.
train_prep['manufacturer'] = train_prep['manufacturer'].apply(preprocess_text)
brand_counts = train_prep['manufacturer'].value_counts()
col_to_keep = brand_counts[brand_counts >= 150].index.tolist()
train_prep['manufacturer'] = train_prep['manufacturer'].where(
    train_prep['manufacturer'].isin(col_to_keep), 'other'
)

In [17]:
# Embedding columns for the bucketed manufacturer, named brand_embeddings_<k>.
brand_embd = generate_embeddings(train_prep, 'manufacturer', 'brand_embeddings')

In [18]:
# Inspect the manufacturer embeddings.
# NOTE(review): in the recorded output both embedding columns hold the same
# value on every displayed row.
brand_embd

Unnamed: 0,brand_embeddings_1,brand_embeddings_2
0,5.660540,5.660540
1,5.743539,5.743539
2,5.527442,5.527442
3,5.527442,5.527442
4,5.527442,5.527442
...,...,...
27527,5.527442,5.527442
27528,5.527442,5.527442
27529,5.888309,5.888309
27530,5.888309,5.888309


In [19]:
# Normalise the condition labels, then learn their embedding columns.
# preprocess_text strips spaces, so 'like new' is stored as 'likenew' from here on.
train_prep['condition'] = train_prep['condition'].map(preprocess_text)
condition_embd = generate_embeddings(train_prep, 'condition', 'condition_embeddings')

In [20]:
# Embedding columns for fuel. The column is not normalised first;
# generate_embeddings replaces missing values with 'other' in train_prep.
fuel_embd = generate_embeddings(train_prep, 'fuel', 'fuel_embeddings')

In [21]:
# Embedding columns for transmission (missing values become 'other' inside generate_embeddings).
transmission_embd = generate_embeddings(train_prep, 'transmission', 'transmission_embeddings')

In [22]:
# Embedding columns for drive (missing values become 'other' inside generate_embeddings).
drive_embd = generate_embeddings(train_prep, 'drive', 'drive_embeddings')

In [34]:
# Missing title_status is treated as 'clean'; the labels are then normalised
# and embedded.
train_prep['title_status'] = (
    train_prep['title_status']
    .fillna('clean')
    .apply(preprocess_text)
)
title_status_embd = generate_embeddings(train_prep, 'title_status', 'title_status_embeddings')

In [24]:
# Normalise the size labels (hyphens and spaces are removed by
# preprocess_text), then learn their embedding columns.
train_prep['size'] = train_prep['size'].map(preprocess_text)
size_embd = generate_embeddings(train_prep, 'size', 'size_embeddings')

In [43]:
# Missing body types are labelled 'other'; the labels are then normalised
# (lower-cased, hyphens removed) and embedded.
train_prep['type'] = (
    train_prep['type']
    .fillna('other')
    .apply(preprocess_text)
)
type_embd = generate_embeddings(train_prep, 'type', 'type_embeddings')

In [26]:
# Embedding columns for paint_color (missing values become 'other' inside generate_embeddings).
paint_embd = generate_embeddings(train_prep, 'paint_color', 'paint_embeddings')

In [27]:
# Normalise the cylinder labels, then learn their embedding columns.
# The output columns are prefixed 'cylinder_embeddings' (singular).
train_prep['cylinders'] = train_prep['cylinders'].map(preprocess_text)
cylinders_embd = generate_embeddings(train_prep, 'cylinders', 'cylinder_embeddings')

In [44]:
# Column-wise concatenation of the ten per-feature embedding frames. Each was
# produced by generate_embeddings from train_prep, so they are expected to
# share its row index. The frames are concatenated directly: seeding the list
# with an empty DataFrame contributes no rows or columns.
embeddings_df = pd.concat(
    [
        brand_embd,
        condition_embd,
        cylinders_embd,
        fuel_embd,
        title_status_embd,
        transmission_embd,
        drive_embd,
        size_embd,
        type_embd,
        paint_embd,
    ],
    axis=1,
)

In [45]:
# Persist the embedding features; index = False keeps the row index out of the file.
embeddings_df.to_csv('embeddings_df.csv', index = False)

In [35]:
# Vehicle age in years relative to a fixed reference year of 2023.
train_prep['car_age'] = 2023 - train_prep['year']

In [36]:
# Turn the cylinder label into a number: take the first run of digits as a
# float; labels that contain no digits (or are missing) become -1.
cylinder_digits = train_prep['cylinders'].str.extract(r'(\d+)', expand=False)
train_prep['cylinders'] = cylinder_digits.astype(float).fillna(-1)

In [37]:
# Ordinal encoding of the condition label.
# preprocess_text is applied to this column in the condition-embedding cell and
# strips spaces, so 'like new' reaches this point as 'likenew'. Both spellings
# are listed so that category is not silently mapped to NaN.
# Labels absent from the mapping (including the 'other' fill value) become NaN.
# NOTE(review): the ranking places 'excellent' above 'like new' and 'new' —
# confirm this ordering is intended.
condition_mapping = {'excellent': 5, 'like new': 4, 'likenew': 4, 'new': 3, 'good':2, 'fair': 1, 'salvage':0}
train_prep['condition'] = train_prep['condition'].map(condition_mapping)

In [38]:
# One 0/1 indicator column per fuel kind; any other value (including a
# missing one) yields 0 in all four columns.
fuel_flags = {
    'gas_fueled': 'gas',
    'diesel_fueled': 'diesel',
    'hybrid_fueled': 'hybrid',
    'electric_fueled': 'electric',
}
for flag_col, fuel_kind in fuel_flags.items():
    train_prep[flag_col] = (train_prep['fuel'] == fuel_kind).astype('int64')

In [39]:
# One 0/1 indicator column per drivetrain code; any other value (including a
# missing one) yields 0 in all three columns.
drive_flags = {
    'front_wheel_drive': 'fwd',
    'four_wheel_drive': '4wd',
    'rear_wheel_drive': 'rwd',
}
for flag_col, drive_kind in drive_flags.items():
    train_prep[flag_col] = (train_prep['drive'] == drive_kind).astype('int64')

In [40]:
# Case-insensitive 0/1 flags for the transmission kind.
# The vectorised .str accessor is used so that a missing transmission yields
# 0 in both columns instead of raising AttributeError; the cell therefore no
# longer depends on the column having been NaN-filled by an earlier cell.
transmission_lower = train_prep['transmission'].str.lower()
train_prep['automatic_transmission'] = (transmission_lower == 'automatic').astype('int64')
train_prep['manual_transmission'] = (transmission_lower == 'manual').astype('int64')

In [56]:
# Inspect the size column; the recorded labels ('midsize', 'fullsize', 'compact')
# contain no hyphens or spaces, consistent with preprocess_text having been applied.
train_prep[['size']]

Unnamed: 0,size
0,midsize
1,fullsize
2,fullsize
3,midsize
4,midsize
...,...
27527,compact
27528,fullsize
27529,midsize
27530,midsize


In [31]:
# Number of missing type values. NOTE(review): recorded as In [31], before the
# type-embedding cell (In [43]) fills them with 'other'.
train_prep['type'].isnull().sum()

456

In [42]:
# Frequency of each type label. NOTE(review): recorded as In [42], before the
# type-embedding cell (In [43]) normalises the labels — the output still shows
# 'SUV' and 'mini-van' in their raw spelling.
train_prep['type'].value_counts()

type
sedan          9259
SUV            7311
truck          3837
coupe          2255
pickup         2118
hatchback       665
van             537
convertible     418
mini-van        229
wagon           210
other           139
offroad          79
bus              19
Name: count, dtype: int64

In [59]:
# Inspect the paint_color embeddings (no output was recorded for this cell).
paint_embd

In [58]:
# NOTE(review): the recorded output lists only the two brand_embeddings
# columns, although the concat cell combines ten embedding frames, and its
# values differ from the earlier brand_embd display — the output looks
# stale or truncated; re-run from a fresh kernel to confirm.
embeddings_df

Unnamed: 0,brand_embeddings_1,brand_embeddings_2
0,5.665786,5.665786
1,5.723587,5.723587
2,5.523724,5.523724
3,5.523724,5.523724
4,5.523724,5.523724
...,...,...
27527,5.523724,5.523724
27528,5.523724,5.523724
27529,5.884892,5.884892
27530,5.884892,5.884892
