In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('data/df_cleaned.csv', index_col=0)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318994 entries, 0 to 458210
Columns: 352 entries, price to state_west
dtypes: float64(4), int64(347), object(1)
memory usage: 859.1+ MB


In [4]:
df.head()

Unnamed: 0,price,year,cylinders,odometer,description,lat,long,posting_weekday,posting_month,posting_year,...,car_model_volkswagen passat,car_model_volkswagen tiguan,car_model_volvo s60,car_model_volvo xc60,car_model_volvo xc70,car_model_volvo xc90,state_midwest,state_northeast,state_south,state_west
0,35990,2010.0,8,32742.0,Carvana is the safer way to buy a car During t...,32.59,-85.48,2,12,2020,...,0,0,0,0,0,0,0,0,1,0
1,7500,2014.0,4,93600.0,I'll move to another city and try to sell my c...,32.5475,-85.4682,2,12,2020,...,0,0,0,0,0,0,0,0,1,0
2,4900,2006.0,6,87046.0,Clean 2006 BMW X3 3.0I. Beautiful and rare Bl...,32.616807,-85.464149,2,12,2020,...,0,0,0,0,0,0,0,0,1,0
4,19500,2005.0,8,116000.0,2005 Ford F350 Lariat (Bullet Proofed). This t...,32.5475,-85.4682,1,12,2020,...,0,0,0,0,0,0,0,0,1,0
5,29590,2016.0,6,33290.0,Carvana is the safer way to buy a car During t...,32.59,-85.48,1,12,2020,...,0,0,0,0,0,0,0,0,1,0


In [5]:
from sklearn.model_selection import train_test_split

#Spliting the data in train and test
X = df.drop('price', axis=1)
y=df['price']

#Train and test split with split size of 25% for training data
split = 0.20
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=split, random_state=35)

print(X_tr.shape, y_tr.shape)
print(X_test.shape, y_test.shape)

(255195, 351) (255195,)
(63799, 351) (63799,)


In [6]:
# Further splitting data in train and validation

#Train and test split with split size of 25% for training data
split = 0.20
X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size=split, random_state=35)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

(204156, 351) (204156,)
(51039, 351) (51039,)


In [7]:
#saving train test data before encoding

X_train.to_csv('data/X_train.csv')
X_val.to_csv('data/X_val.csv')
X_test.to_csv('data/X_test.csv')
y_train.to_csv('data/y_train.csv')
y_val.to_csv('data/y_val.csv')
y_test.to_csv('data/y_test.csv')

In [7]:
index = X_train.columns

#### Initially we will use nltk, Tfidf to vectorize description column to start with (later part would use sentence encoders and other NLP libraries)

In [8]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zesha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zesha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zesha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

def preprocess(text):
    stop_words =stopwords.words('english')
    
    # Split whitespace
    text=text.split(' ')
    
    # Lowercase
    text = [word.lower() for word in text]
    
    # Remove punctuation - 
    #  translate table functions by firstly removing translation table which will replace punctuations with empty strings. By calling .translate(), 
    #  function will be applied on character level which will capture cases where puncutation is joined with a word
    punc_table = str.maketrans('', '', string.punctuation) #translation table
    text = [word.translate(punc_table) for word in text]
    
    #Stop Words removal
    text=[word for word in text if word not in stop_words]
    
    #LEMMATIZE
    #  Intantiate
    lemmatizer = WordNetLemmatizer()
    
    #Part-of-speech constants
    #Loop is run and lemmatize will take two arguments one is token and other is a mapping of pos_tag with wordnet value. 
    #  Verb lemmatizing
    text = [lemmatizer.lemmatize(word, pos='v') for word in text]
    #  Noun lemmatizing
    text = [lemmatizer.lemmatize(word, pos='n') for word in text]
    #  Adjective lemmatizing
    text = [lemmatizer.lemmatize(word, pos='a') for word in text]
    #text = [lemmatizer.lemmatize(word, pos='j') for word in text]
    #  Adverb lemmatizing
    text = [lemmatizer.lemmatize(word, pos='r') for word in text]
    #ADJ_SAT
    text = [lemmatizer.lemmatize(word, pos='s') for word in text]
    
    
    return text

Lemmatization is preferred over the Stemming 

Stemming algorithm works by cutting the suffix from the word. In a broader sense cuts either the beginning or end of the word.

On the contrary, Lemmatization is a more powerful operation, and it takes into consideration morphological analysis of the words. It returns the lemma which is the base form of all its inflectional forms. In-depth linguistic knowledge is required to create dictionaries and look for the proper form of the word. Stemming is a general operation while lemmatization is an intelligent operation where the proper form will be looked in the dictionary. Hence, lemmatization helps in forming better machine learning features. 

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Instantiate vectorizer
tfidf = TfidfVectorizer(tokenizer=preprocess, min_df=0.1, max_df=0.6)

#Fit Veectorizers
#tfidf = tfidf.fit(df['description'])
X_train_tfidf = tfidf.fit(X_train['description'])

#Transform
#df_des = tfidf.transform(df['description'])
X_train_id = X_train_tfidf.transform(X_train['description'])
X_test_id = X_train_tfidf.transform(X_test['description'])
X_val_id = X_train_tfidf.transform(X_val['description'])
#tfidf_train = tfidf.transform(X_train['description'])
#tfidf_test = tfidf.transform(X_test['description'])

In [None]:
#X_train_idf = X_train_id.copy()
#X_test_idf = X_test_id.copy()
#X_vel_idf = 

In [None]:
#X_vel_idf = 

In [15]:
col_names = ["d_"+column for column in X_train_tfidf.get_feature_names()]

In [16]:
X_train_id = pd.DataFrame(X_train_id.todense(), columns=col_names)

In [17]:
X_train_id.shape

(204156, 444)

In [18]:
X_test_id = pd.DataFrame(X_test_id.todense(), columns=col_names)
X_test_id.shape

(63799, 444)

In [19]:
X_val_id = pd.DataFrame(X_val_id.todense(), columns=col_names)
X_val_id.shape

(51039, 444)

In [22]:
(X_val.isna().mean()*100.0).sort_values(ascending=False)

state_west                       0.0
car_model_chevrolet nan          0.0
car_model_chevrolet corvette     0.0
car_model_chevrolet cruze        0.0
car_model_chevrolet equinox      0.0
                                ... 
car_model_lexus ls               0.0
car_model_lexus rx               0.0
car_model_lincoln continental    0.0
car_model_lincoln mkc            0.0
year                             0.0
Length: 351, dtype: float64

In [25]:
X_val_id.head()

Unnamed: 0,d_00,d_01,d_02,d_03,d_04,d_05,d_06,d_07,d_08,d_09,...,d_wheel,d_white,d_window,d_wiper,d_without,d_won’t,d_work,d_year,d_youdisclaimer,d_✅
0,0.046727,0.04657,0.04641,0.046276,0.046186,0.045945,0.045816,0.045671,0.045413,0.045424,...,0.0,0.0,0.0,0.0,0.075496,0.049316,0.03456,0.0,0.049584,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.180568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.137737,0.0,0.042267,0.0,0.0,0.0,0.086605,0.046066,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.102461,0.0,0.041922,0.0,0.046912,0.0,0.0859,0.045691,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.13732,0.438253,0.0,0.0


In [29]:
X_train_id = X_train_id.reset_index()
X_test_id = X_test_id.reset_index()
X_val_id = X_val_id.reset_index()

In [30]:
X_train = X_train.reset_index()
X_test = X_test.reset_index()
X_val = X_val.reset_index()

In [31]:
#dropping description column
X_train.drop(['description'], axis = 1, inplace = True)
X_test.drop(['description'], axis = 1, inplace = True)
X_val.drop(['description'], axis = 1, inplace = True)

In [32]:
X_train_tfidf = pd.concat([X_train, X_train_id], axis=1)

In [33]:
X_test_tfidf = pd.concat([X_test, X_test_id], axis=1) 

In [34]:
X_val_tfidf = pd.concat([X_val, X_val_id], axis=1) 

In [35]:
X_train_tfidf.head()

Unnamed: 0,index,year,cylinders,odometer,lat,long,posting_weekday,posting_month,posting_year,condition_excellent,...,d_wheel,d_white,d_window,d_wiper,d_without,d_won’t,d_work,d_year,d_youdisclaimer,d_✅
0,322098,2017.0,8,74863.0,45.519861,-122.434111,3,11,2020,0,...,0.067461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,344532,2019.0,4,4142.0,41.4,-75.66,1,11,2020,0,...,0.0,0.0,0.0,0.0,0.059981,0.039182,0.027457,0.0,0.039394,0.0
2,13565,2013.0,4,61674.0,34.62,-112.42,3,11,2020,0,...,0.0,0.0,0.0,0.0,0.075885,0.04957,0.034738,0.0,0.049839,0.0
3,334734,2017.0,6,26451.0,40.27,-76.88,4,11,2020,0,...,0.0,0.0,0.0,0.0,0.075588,0.049377,0.034602,0.0,0.049645,0.0
4,36265,2019.0,4,4161.0,33.779214,-84.411811,1,12,2020,0,...,0.0,0.0,0.0,0.0,0.073911,0.048281,0.033834,0.0,0.048543,0.0


In [36]:
(X_test_tfidf.isna().mean()*100.0).sort_values(ascending=False)

d_✅                               0.0
car_model_mitsubishi outlander    0.0
car_model_mitsubishi lancer       0.0
car_model_mitsubishi eclipse      0.0
car_model_mini hardtop            0.0
                                 ... 
d_easy                            0.0
d_dual                            0.0
d_driver                          0.0
d_drive                           0.0
index                             0.0
Length: 796, dtype: float64

In [37]:
(X_test_tfidf.isna().mean()*100.0).sort_values(ascending=False)

d_✅                               0.0
car_model_mitsubishi outlander    0.0
car_model_mitsubishi lancer       0.0
car_model_mitsubishi eclipse      0.0
car_model_mini hardtop            0.0
                                 ... 
d_easy                            0.0
d_dual                            0.0
d_driver                          0.0
d_drive                           0.0
index                             0.0
Length: 796, dtype: float64

In [39]:
(X_val_tfidf.isna().mean()*100.0).sort_values(ascending=False)

d_✅                               0.0
car_model_mitsubishi outlander    0.0
car_model_mitsubishi lancer       0.0
car_model_mitsubishi eclipse      0.0
car_model_mini hardtop            0.0
                                 ... 
d_easy                            0.0
d_dual                            0.0
d_driver                          0.0
d_drive                           0.0
index                             0.0
Length: 796, dtype: float64

In [40]:
#Copying clean data for next notebook
X_train_tfidf.to_csv('data/X_train_tfidf.csv') 
X_test_tfidf.to_csv('data/X_test_tfidf.csv') 
X_val_tfidf.to_csv('data/X_val_tfidf.csv') 