In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
#from textblob import TextBlob
from googletrans import Translator
from analysis_helpers import get_correlations
import simplemma

In [2]:
# Load the preprocessed used-car dataset and preview its first rows
dataset_path = '../data/used_car_dataset.parquet'
df = pd.read_parquet(dataset_path)
df.head()

Unnamed: 0,price (HUF),condition,design,vehicle history,financing,clock position (km),shippable persons number,number of doors,color,own weight (kg),...,Take away from 25%,pedestrian airbag,electrically adjustable headrests,home mains charger,anti-theft,towing hook - electrically foldable,amplifier,suede upholstery,type2 charging cable,description
0,6895000,Kitűnő,Sedan,Elérhető autó-előélet alvázszám alapján,25%-tól elvihető,110000,5.0,4.0,Fekete (metál),1426.0,...,True,False,False,False,False,False,False,False,False,leírás\nelső tulajdonostól érkezett hozzánk ez...
1,9999999,Megkímélt,Városi terepjáró (crossover),Elérhető autó-előélet alvázszám alapján,20%-tól elvihető,119300,5.0,5.0,Fehér,2150.0,...,False,False,False,False,True,False,False,False,False,"leírás\nszép állapotú, 119 300 kilométert futo..."
2,4995000,Kitűnő,Városi terepjáró (crossover),Elérhető autó-előélet alvázszám alapján,20%-tól elvihető,70000,5.0,5.0,Kék,1200.0,...,False,False,False,False,False,False,False,False,False,leírás\nelső tulajdonostól érkezett hozzánk! t...
3,3499000,Kitűnő,Városi terepjáró (crossover),Elérhető autó-előélet alvázszám alapján,,100000,5.0,5.0,Sötétkék,1176.0,...,False,False,False,False,False,False,False,False,False,"leírás\neladó egy új korában magyarországon, a..."
4,7990000,Normál,Városi terepjáró (crossover),Elérhető autó-előélet alvázszám alapján,20%-tól elvihető,74760,5.0,5.0,Fehér,1470.0,...,False,False,False,False,False,False,False,False,False,


In [3]:
# Keep the original column labels (reused further down) and show them as one comma-separated string
cols = df.columns
', '.join(name for name in cols)



### Get car brand

In [4]:
# The brand is the fifth '/'-separated segment of the listing link.
# The vectorised .str accessor yields NaN (instead of raising) when a link
# is missing or has fewer than five segments.
df['brand'] = df['link'].str.split('/').str[4]
df['brand'].head()

0       volkswagen
1    mercedes-benz
2            dacia
3          renault
4    mercedes-benz
Name: brand, dtype: object

In [5]:
# The colour text carries two pieces of information: the colour itself and
# whether the car has a metallic polish (marked with the "(metál)" suffix).
raw_color = df['color']
df['metallic_polish'] = raw_color.str.contains('metál')
df['color'] = raw_color.str.replace(' (metál)', '', regex=False).str.lower()

### Extract the city from the address

If the car is advertised by a shop, the first contact field is the name of the shop. If the car is advertised by an individual, the first field is the address. Once we have the address, we can extract the city name from it. Luckily we have a CSV with all the Hungarian settlement names, so we can find them in the addresses even though the address format is not uniform and would be hard to capture with a hand-written regex.

In [6]:
df_hun_sattlements = pd.read_csv('all_hun_settlement.csv')

# Collapse every Budapest district (e.g. "Budapest I. kerület") into plain "budapest",
# then normalise case and surrounding whitespace.
# regex=True must be explicit: the pattern is a regular expression, and pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the districts untouched.
df_hun_sattlements['cleaned_settlement'] = (
    df_hun_sattlements['settlement']
    .str.replace(r'Budapest.+', 'Budapest', regex=True)
    .str.lower()
    .str.strip()
)

df_hun_sattlements.head()

  df_hun_sattlements['cleaned_settlement'] = df_hun_sattlements['settlement'].str.replace(r'Budapest.+', 'Budapest').str.lower().str.strip()


Unnamed: 0,zip,settlement,short,latitude,longitude,cleaned_settlement
0,1011,Budapest I. kerület,BU,47.5011,19.0469,budapest
1,1012,Budapest I. kerület,BU,47.5,19.0833,budapest
2,1013,Budapest I. kerület,BU,47.4961,19.0309,budapest
3,1014,Budapest I. kerület,BU,47.5034,19.0349,budapest
4,1015,Budapest I. kerület,BU,47.5,19.0833,budapest


In [7]:
def get_city_from_zip(_zip, settlements=None):
    """Return the cleaned settlement name registered for a ZIP code.

    Parameters
    ----------
    _zip : int or float
        ZIP code to look up. A missing value (NaN/None) never matches.
    settlements : pandas.DataFrame, optional
        Lookup table with ``zip`` and ``cleaned_settlement`` columns.
        Defaults to the notebook-level ``df_hun_sattlements`` frame.

    Returns
    -------
    str or None
        The first matching ``cleaned_settlement`` value, or ``None`` when the
        ZIP code is missing or not present in the table.
    """
    if pd.isna(_zip):
        # Nothing to look up; skip the full-column comparison.
        return None
    if settlements is None:
        settlements = df_hun_sattlements
    matches = settlements.loc[settlements['zip'] == _zip, 'cleaned_settlement']
    if matches.empty:
        return None
    return matches.iloc[0]

In [8]:
msk_is_salse_shop = df['buy_from_shop']
# If the car is advertised by a shop, the first contact field is the shop name and the
# second one holds the address; for an individual the first field is already the address.
# Collect the address of every listing in one Series. It reuses df's index so that the
# column assignments below align row by row whatever index df carries.
address = pd.Series(
    np.where(msk_is_salse_shop, df['content_info_1'], df['content_info_0']),
    index=df.index,
)

# Lower-case the addresses because the settlement names were lower-cased as well
address = address.str.lower()

# Remove the "view on map" link text that comes from the scraping and is not part of the address
address = address.str.replace('térkép megtekintése', '', regex=False)

# Keep only digits and spaces, then take the first 4-digit group surrounded by spaces as ZIP code.
# NOTE(review): the pattern needs a space on both sides, so a ZIP code at the very start or
# end of the string is not matched - confirm this is intended.
zip_code = address.str.replace(r'[^\d ]', '', regex=True)
zip_code = zip_code.str.extract(r"( \d\d\d\d )")[0].astype(float)
cities_based_on_zip = [
    get_city_from_zip(x) for x in zip_code
]

# Keep only lowercase Hungarian letters and whitespace
address_without_numbers = address.str.replace(r"[^a-záéíóöőúüű\s]", "", regex=True)

# Extract the first known settlement name (surrounded by spaces) from each address.
# NOTE(review): settlement names are inserted into the pattern unescaped - verify none of
# them contains regex metacharacters.
settlement_alternatives = ' | '.join(df_hun_sattlements['cleaned_settlement'].unique())
df['city'] = address_without_numbers.str.extract(f"( {settlement_alternatives} )", expand=False)
df['city'] = df['city'].str.strip()

# Fall back to the ZIP-based city where no settlement name was found in the address
df['city'] = np.where(df['city'].isna(), cities_based_on_zip, df['city'])

In [9]:
# Share of listings for which no city could be determined (many advertisements do not show an address)
df['city'].isna().mean()

0.2796135324226913

### Get take away discount values

In [10]:
# The financing text looks like "25%-tól elvihető" (can be taken away from 25%).
# Keep only its digits; rows left with no digits at all get the value 100.
financing_digits = df['financing'].str.replace(r'\D', '', regex=True)
financing_digits = financing_digits.replace({'':'100'})
df['financing'] = financing_digits.astype(int)

In [11]:
# Frequency of each take-away percentage; 100 is the value assigned above to rows without any digits
df['financing'].value_counts()

100    24637
20      4976
30      1398
25      1222
0        453
40       155
35        85
50        51
10        26
15         8
45         6
Name: financing, dtype: int64

In [12]:
# List the "take away" indicator columns; they duplicate the financing information and are dropped later
is_take_column = cols.str.lower().str.contains('take')
cols[is_take_column]

Index(['Take away from 20%', 'Can be taken away from 30%',
       'Take away from 25%'],
      dtype='object')

In [13]:
# Use whole years instead of days: the day-level resolution of the age does not really
# matter, and it makes building a model harder
age_in_years = df['age_days'].div(365)
df['age_year'] = age_in_years.astype(int)

### Get features from description

(1. **Sentiment Analysis:** **This only works with English text, so translate the descriptions first!** You can perform sentiment analysis on the description to see if it's generally positive, negative, or neutral. This could be interesting to see if there's any correlation between the sentiment of the description and other variables.)

2. **Text Length:** The length of the description could be indicative of something. For instance, sellers who are more serious might write more detailed descriptions. You can easily calculate this with Python's `len()` function.

3. **Word Count:** Similar to text length, but this might give different results because it doesn't count characters but words. A higher word count might indicate a more complex description.

4. **Count of Specific Words:** You can count the occurrence of specific words that you think might be important. For example, if you're looking at car listings, words like "new", "used", "clean", "serviced" etc., might be of interest.

5. **Count of Capitalized Words:** Sellers might capitalize certain words to draw attention to them. Counting the number of capitalized words could be another feature.

6. **TF-IDF Scores:** This is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. Words that are used frequently in a single document but not frequently in the corpus receive a higher TF-IDF score.

In [14]:
# Remove surrounding whitespace, then drop the leading "leírás" label (it means "description")
# together with the whitespace that follows it.
# The pattern is anchored to the start so the word is only removed as a label, and a
# description that consists of nothing but the label ends up empty instead of keeping "leírás".
df['description'] = (
    df['description']
    .str.strip()
    .str.replace(r'^leírás\b\s*', '', regex=True)
    .str.strip()
)


In [15]:
# Character and word counts of each description.
# The vectorised .str methods give NaN for a missing description instead of raising.
df['description_length'] = df['description'].str.len()
df['word_count'] = df['description'].str.split().str.len()


# Sentiment analysis
# TODO: translate the descriptions first
# df['description_polarity'] = df['description'].apply(lambda x: TextBlob(x).sentiment.polarity)
# df['description_subjectivity'] = df['description'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [16]:
def lemmatize_text(text):
    """Lemmatize a text word by word with simplemma's Hungarian ('hu') model.

    The text is split on whitespace, every token is lemmatized on its own and
    the lemmas are joined back with single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(simplemma.lemmatize(token, lang='hu'))
    return ' '.join(lemmas)

In [17]:
# Remove every character that is not a lowercase Hungarian letter or whitespace.
# NOTE(review): the characters are deleted, not replaced by a space, so words joined by
# punctuation (e.g. with a hyphen) are merged into one token - confirm this is intended.
df['description_lemmatized'] = df['description'].str.replace(r'[^a-záéíóöőúüű\s]', '', regex=True)

# Lemmatize
df['description_lemmatized'] = df['description_lemmatized'].map(lemmatize_text)

# Manual lemmatization for special cases
word_map = {'cserél':'csere', 'garantál':'garancia',
            'állapotú':'állapot', 'állap':'állapot', 'állapotot':'állapot', 'használat':'használ',
            'ülése':'ülés', 'elektromosan':'elektromos', 'magyarországi':'magyarország',
            'megtekinthető':'megtekintés', 'km':'kilométer',
            'gépjármű':'gépkocsi', 'autó':'gépkocsi'}

# The replacements are applied one after another, in dictionary order, as plain substring
# replacements (regex=False keeps this independent of the pandas default).
# NOTE(review): because substrings are replaced, a key also changes longer words that
# contain it, and earlier replacements feed into later ones - verify this is wanted.
for k,v in word_map.items():
    df['description_lemmatized'] = df['description_lemmatized'].str.replace(k, v, regex=False)


In [18]:
# Load the Hungarian stop words: one entry per line of the file, surrounding whitespace removed
with open('stopwords-hu.txt', 'r', encoding='utf-8') as stop_word_file:
    hungarian_stop_words = list(map(str.strip, stop_word_file))

In [19]:
# Create the TF-IDF vectorizer, limited to 90 features and ignoring the Hungarian stop words
vectorizer = TfidfVectorizer(stop_words=hungarian_stop_words, max_features=90)

# Fit the vectorizer on the lemmatized descriptions
X = vectorizer.fit_transform(df['description_lemmatized'])

# Translate the feature names from Hungarian to English.
# NOTE(review): the column names depend on what the translation service returns, while
# later cells refer to fixed names such as 'tfidf_sports' and 'tfidf_sell' - consider
# storing the translations so the notebook stays reproducible.
translator = Translator()
translations = translator.translate(list(vectorizer.get_feature_names_out()), src='hu', dest='en')

# Build a dataframe from the features. It reuses df's index so that the later
# concatenation with df and boolean masks built from it align row by row.
tfidf_df = pd.DataFrame(
    X.toarray(),
    columns=['tfidf_'+x.text for x in translations],
    index=df.index,
)



In [20]:
# Use the singular form for the translated "sport" feature
tfidf_df = tfidf_df.rename(columns={'tfidf_sports':'tfidf_sport'})

In [21]:
# Feature names (untranslated) selected by the TF-IDF vectorizer
vectorizer.get_feature_names_out()

array(['ablak', 'ajánlattétel', 'alváz', 'automata', 'belső', 'beszámít',
       'biztosítás', 'bmw', 'bőr', 'csere', 'csomag', 'db', 'digitális',
       'egyeztetés', 'elad', 'elektromos', 'elvihető', 'elöl', 'es',
       'eset', 'esztétika', 'extra', 'felszereltség', 'forgalom', 'friss',
       'fényszóró', 'fűthető', 'garancia', 'gumi', 'gyári', 'gépkocsi',
       'használ', 'hirdetés', 'hitel', 'hátsó', 'időpont', 'ig', 'jármű',
       'karbantart', 'kerül', 'kilométer', 'klíma', 'kormány', 'kulcs',
       'kér', 'köszön', 'külső', 'led', 'lehetséges', 'leinformálható',
       'légzsák', 'magyarország', 'megbízható', 'megkímélt',
       'megtekintés', 'minősül', 'motor', 'műszaki', 'navigáció', 'nyári',
       'olaj', 'rendelkezik', 'rendszer', 'rendszeres', 'sport', 'stb',
       'személyes', 'szerel', 'szerviz', 'szép', 'tart', 'tartalmaz',
       'telefon', 'tempomat', 'tud', 'tulajdonos', 'téli', 'tükör',
       'vezet', 'vált', 'vár', 'vásárlás', 'vásárol', 'vég', 'állapot',
 

In [22]:
# Check for duplicated column names after translation: the most frequent names are listed first,
# so every count should be 1
tfidf_df.columns.value_counts().head(10)

tfidf_window        1
tfidf_repair        1
tfidf_etc           1
tfidf_sport         1
tfidf_regular       1
tfidf_system        1
tfidf_has           1
tfidf_oil           1
tfidf_summer        1
tfidf_navigation    1
dtype: int64

In [23]:
# Append the TF-IDF features to the original dataframe and drop the columns that are
# no longer needed: raw contact fields, upholstery colours, the "take away" indicators,
# the age in days and the raw/lemmatized description text
columns_to_drop = [
    'content_info_0',
    'content_info_1',
    'content_info_2',
    'content_info_3',
    'content_info_4',
    'content_info_5',
    'upholstery color (1)',
    'upholstery color (2)',
    'content_info_6',
    'Take away from 20%',
    'Take away from 25%',
    'Can be taken away from 30%',
    'age_days',
    'description',
    'description_lemmatized',
]
df2 = pd.concat([df, tfidf_df], axis=1).drop(columns=columns_to_drop)


In [24]:
# Preview the combined frame after the TF-IDF columns were added and the listed columns dropped
df2.head()

Unnamed: 0,price (HUF),condition,design,vehicle history,financing,clock position (km),shippable persons number,number of doors,color,own weight (kg),...,tfidf_Castle,tfidf_buying,tfidf_buys,tfidf_end,tfidf_condition,tfidf_adjustable,tfidf_price,tfidf_inquire,tfidf_administration,tfidf_seat
0,6895000,Kitűnő,Sedan,Elérhető autó-előélet alvázszám alapján,25,110000,5.0,4.0,fekete,1426.0,...,0.0,0.0,0.0,0.0,0.133185,0.0,0.0,0.0,0.222271,0.0
1,9999999,Megkímélt,Városi terepjáró (crossover),Elérhető autó-előélet alvázszám alapján,20,119300,5.0,5.0,fehér,2150.0,...,0.0,0.225378,0.0,0.0,0.255111,0.0,0.0,0.0,0.0,0.0
2,4995000,Kitűnő,Városi terepjáró (crossover),Elérhető autó-előélet alvázszám alapján,20,70000,5.0,5.0,kék,1200.0,...,0.0,0.0,0.0,0.0,0.150843,0.0,0.0,0.0,0.251739,0.0
3,3499000,Kitűnő,Városi terepjáró (crossover),Elérhető autó-előélet alvázszám alapján,100,100000,5.0,5.0,sötétkék,1176.0,...,0.0,0.127754,0.126606,0.243211,0.144609,0.146088,0.0,0.0,0.120668,0.0
4,7990000,Normál,Városi terepjáró (crossover),Elérhető autó-előélet alvázszám alapján,20,74760,5.0,5.0,fehér,1470.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Use the listing link as the row index
df2 = df2.set_index('link')

In [26]:
# Correlations computed by the project helper `get_correlations`
# NOTE(review): the helper is not shown here; judging by the output it returns one row per
# variable pair (variable1, variable2, correlation) sorted by correlation - confirm
df_melt_corr = get_correlations(df2)
df_melt_corr

Unnamed: 0,variable1,variable2,correlation
2029,back winter tires rim diameter,winter tires rim diameter,0.997900
1672,back summer tires rim diameter,summer tires rim diameter,0.997234
3095,description_length,word_count,0.989374
1910,back winter tires apect ratio,winter tires apect ratio,0.972543
1553,back summer tires apect ratio,summer tires apect ratio,0.971512
...,...,...,...
1552,back summer tire width,summer tires apect ratio,-0.717508
1554,back summer tires rim diameter,summer tires apect ratio,-0.745287
2143,back summer tire width,back summer tires apect ratio,-0.755232
1671,back summer tires apect ratio,summer tires rim diameter,-0.757933


In [27]:
# Keep only the variable pairs whose correlation is above 0.96
is_high_corr = df_melt_corr['correlation'] > 0.96
df_high_corr = df_melt_corr.loc[is_high_corr]
df_high_corr

Unnamed: 0,variable1,variable2,correlation
2029,back winter tires rim diameter,winter tires rim diameter,0.9979
1672,back summer tires rim diameter,summer tires rim diameter,0.997234
3095,description_length,word_count,0.989374
1910,back winter tires apect ratio,winter tires apect ratio,0.972543
1553,back summer tires apect ratio,summer tires apect ratio,0.971512


In [28]:
# First member of every highly correlated pair (these columns are dropped further down)
high_corr_values = df_high_corr['variable1'].to_numpy()
high_corr_values

array(['back winter tires rim diameter', 'back summer tires rim diameter',
       'description_length', 'back winter tires apect ratio',
       'back summer tires apect ratio'], dtype=object)

In [29]:
# Correlation rows in which the price is the first variable of the pair
df_melt_corr.loc[df_melt_corr['variable1'] == 'price (HUF)']

Unnamed: 0,variable1,variable2,correlation
14,price (HUF),summer tires rim diameter,0.641961
17,price (HUF),winter tires rim diameter,0.629429
12,price (HUF),summer tire width,0.576964
15,price (HUF),winter tire width,0.551881
6,price (HUF),total weight (kg),0.527645
...,...,...,...
42,price (HUF),tfidf_sell,-0.128260
37,price (HUF),tfidf_exchange,-0.146085
85,price (HUF),tfidf_technical,-0.158143
16,price (HUF),winter tires apect ratio,-0.379935


In [30]:
# Correlation rows in which the price is the second variable of the pair
df_melt_corr.loc[df_melt_corr['variable2'] == 'price (HUF)']

Unnamed: 0,variable1,variable2,correlation
11,initial part,price (HUF),0.956241
9,performance (kW),price (HUF),0.704216
23,back winter tires rim diameter,price (HUF),0.692303
20,back summer tires rim diameter,price (HUF),0.681067
18,back summer tire width,price (HUF),0.657214
21,back winter tire width,price (HUF),0.655488
5,own weight (kg),price (HUF),0.600747
8,cylinder capacity (cm3),price (HUF),0.445689
10,MOT is valid (days),price (HUF),0.255199
26,description_length,price (HUF),0.197121


In [31]:
# Drop 'initial part' since it is almost identical to the price, together with the first
# member of every highly correlated pair; names that are not in the frame are ignored
redundant_columns = ['initial part', *high_corr_values]
df2 = df2.drop(columns=redundant_columns, errors='ignore')

In [32]:
# Some listings carry an extremely low price that was entered by mistake; keep only
# the rows priced above 100000 HUF
has_plausible_price = df2['price (HUF)'] > 100000
df2 = df2.loc[has_plausible_price]

In [33]:
# Save the engineered dataset as a parquet file
df2.to_parquet('../data/used_car_engineered_data.parquet')

In [34]:
# Save the settlement table, including the added 'cleaned_settlement' column
# NOTE(review): the file name ('all_hun_sattlement.csv') differs from the one read earlier
# ('all_hun_settlement.csv') - confirm that writing a separate file is intentional
df_hun_sattlements.to_csv('all_hun_sattlement.csv', index=False)

In [35]:
# Collect every non-missing description, one per line, for the word cloud in the EDA
descriptions = df['description'].dropna()
text = '\n'.join(descriptions)

In [36]:
# Write the descriptions as UTF-8: the text contains Hungarian accented letters and other
# non-ASCII characters, which the platform default encoding may not be able to encode
with open('descriptions.txt', 'w', encoding='utf-8') as file:
    file.write(text)

In [44]:
# First ten descriptions in which the 'tfidf_sell' feature has a positive weight
df.loc[tfidf_df['tfidf_sell'] > 0, 'description'].values[:10]

array(['eladó egy új korában magyarországon, a renault miskolc-ban-ban vásárolt, és végig ott szervizelt, végig egy tulajdonos által használt, mindössze 110000 km-t futott, 90 lóerős, benzines, renault captur 0.9 tce energy alize euro 6. az autó kifogástalan műszaki állapotú, légkondicionálóval és rendeltetésszerűen működő egyéb extrafelszerelésekkel(tempomat\u200b, usb csatlakozós multimédia, elektromosan állítható-, fűthető visszapillantó tükrök, első-hátsó elektromos ablakemelők, tologatható hátsó üléssor, menetstabilizátor(esp), kipörgésgátló(asr), légzsákok(vezető, utas, oldal). a beltér tiszta, állapota tükrözi az alacsony futásteljesítményt. járműszolgáltatási platform-os lekérdezése, szervizszámlái megtekinthetőek a dokumentum fül alatt. mindkét gyári kulcskártyájával és az összes kezelési útmutatóval rendelkezik. a műszaki vizsga még több mint másfél évig érvényes. a gépkocsira egy év(igény szerint akár három) műszaki garanciát adunk. eladására új autó vásárlása miatt került s