# Flow 2:

## TF-IDF, Manual Feature Engineering, Heterogeneity of Data Distribution

# Data Loading

In [3]:
import pandas as pd

In [6]:
df = pd.read_csv('../divar_reduced_posts.csv').drop(columns=['Unnamed: 0'])

In [7]:
df

Unnamed: 0,archive_by_user,brand,cat1,city,created_at,desc,id,image_count,mileage,platform,price,title,type,year
0,False,,for-the-home,Tehran,Tuesday 07PM,کلاسیک و شیک و استثنایی\nچرم مالزی\nچوب راش\nف...,54761638662241,5,,mobile,3850000,ست مبلمان و نهارخوری ٩ نفره,,
1,False,,for-the-home,Mashhad,Tuesday 07PM,"سلام,یک عدد گلدون نخل سه طبقه ی سالم دارم با پ...",42727631379133,1,,mobile,30000,گلدون مصنوعی نخل,,
2,False,,vehicles,Mashhad,Tuesday 07PM,سریال 43j$NUMبدون شکستگی سه حلقه لاستیک نو یک ...,63194439667302,4,,mobile,-1,لودر کاتر پیلار 950,,
3,False,,for-the-home,Tehran,Tuesday 07PM,مبل راحتی هفت نفره شامل سه نفره یک عدد دونفره ...,19133025491169,4,,mobile,600000,مبل راحتی هفت نفره بامیز جلو مبلی,,
4,False,,personal,Karaj,Tuesday 08PM,شارژی کنترلی سویچ حمل تا 35 کیلو صندلی برای دو...,58998998335444,2,,mobile,450000,ماشین شارژی,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,,personal,Tehran,Wednesday 07PM,اصلا پوشیده نشده نو هستش,21063397838341,4,,mobile,55000,مانتو مجلسی قرمز سایز44 46,women,
9996,False,,personal,Ahvaz,Wednesday 08PM,لباس تمام گیپور. تو عکس مشخصه تمیز.فقط ۱بار تن...,42819975080363,3,,mobile,70000,لباس مجلسییییی شیک,women,
9997,False,Huawei::هوآوی,electronic-devices,Karaj,Thursday 04AM,نونو همراه با جبه با همه چی,7494598410644,3,,mobile,400000,HUAWEI g6,,
9998,False,,for-the-home,Tehran,Thursday 04AM,باز هم اجناس دیگه هم هست اگه دوست داشتید به تل...,24182403631548,1,,mobile,41000,نمکپاش و قندان,,


In [8]:
df.shape

(10000, 14)

# Data Encoding

### Encoding with TF-IDF

In [10]:
from hazm import *
import codecs


normalizer = Normalizer()
stopwords = [normalizer.normalize(x.strip()) for x in codecs.open(
    '../stopwords.txt', 'r', 'utf-8').readlines()]
stopwords.extend(['تماس', 'تلگرام', 'واتساپ', 'با سلام', 'سلام', 'فروش', 'فوری',
                 'قیمت', 'مقطوع', 'تخفیف', 'نو', 'در حد نو', 'خرید', 'اقساطی', 'تحویل', 'اقساط'])

In [11]:
from tqdm import tqdm


for idx in tqdm(range(df.shape[0])):
    title_sent = df['title'][idx].replace('$NUM' , ' ')
    desc_sent = df['desc'][idx].replace('$NUM' , ' ')
    
    df.loc[idx, 'desc'] = '  '.join([str(elem) for elem in [t for t in word_tokenize(desc_sent) if t not in stopwords]])
    df.loc[idx, 'title'] = '  '.join([str(elem) for elem in [t for t in word_tokenize(title_sent) if t not in stopwords]])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:03<00:00, 2626.36it/s]


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from hazm import Normalizer, word_tokenize


def vectorize_with_tf_idf(dataframe):
    normalizer = Normalizer()

    title_tfidf = TfidfVectorizer(
        ngram_range=(1, 1), norm='l2', preprocessor=normalizer.normalize, tokenizer=word_tokenize
    )
    title_tfidf_matrix = title_tfidf.fit_transform(df['title'].values)
#     print(title_tfidf.vocabulary_)
#     print(len(title_tfidf.vocabulary_))


    desc_tfidf = TfidfVectorizer(
        ngram_range=(1, 1), norm='l2', preprocessor=normalizer.normalize, tokenizer=word_tokenize
    )
    desc_tfidf_matrix = desc_tfidf.fit_transform(df['desc'].values)
#     print(desc_tfidf.vocabulary_)
#     print(len(desc_tfidf.vocabulary_))
    
    return title_tfidf_matrix, desc_tfidf_matrix

In [13]:
title_tfidf_matrix, desc_tfidf_matrix = vectorize_with_tf_idf(df)



In [14]:
print('title vector shape:', title_tfidf_matrix.shape)
print('desc vector shape:', desc_tfidf_matrix.shape)

title vector shape: (10000, 6883)
desc vector shape: (10000, 16440)


# Feature Engineering

### Manually select most frequent words

In this section, we extract the words with most tfidf values, because we cannot use the whole matrix.

In [15]:
desc_tfidf_df = pd.DataFrame(desc_tfidf_matrix.toarray())

words_tfidf_sums = []
for col in desc_tfidf_df.columns:
    words_tfidf_sums.append((col, desc_tfidf_df[col].sum()))

sorted_tfidf_desc = sorted(words_tfidf_sums, key=lambda x: x[1])[-100:]
selected_tfidf_desc = desc_tfidf_df[[col[0] for col in sorted_tfidf_desc]]

selected_tfidf_desc.columns = [str(i) for i in range(0,100)]

In [34]:
selected_tfidf_desc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.126748
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.167239,0.0,0.0,0.0,0.0,0.000000,0.107378
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.098232
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.385534,0.0,0.0,0.0,0.0,0.103171,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.103892,0.000000
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000


In [16]:
title_tfidf_df = pd.DataFrame(title_tfidf_matrix.toarray())
words_tfidf_sums = []

for col in title_tfidf_df.columns:
    words_tfidf_sums.append((col, title_tfidf_df[col].sum()))

sorted_tfidf_title = sorted(words_tfidf_sums, key=lambda x: x[1])[-100:]
selected_tfidf_titles = title_tfidf_df[[col[0] for col in sorted_tfidf_title]]

selected_tfidf_titles.columns = [str(i) for i in range(100,200)]

In [33]:
selected_tfidf_titles

Unnamed: 0,100,101,102,103,104,105,106,107,108,109,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.0,0.0,0.41962,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.319933,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.352799,0.000000,0.0,0.0,0.327483,0.0
4,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.292641,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0
9996,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.439559,0.0,0.0,0.000000,0.0
9997,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0
9998,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0


### Manually Remove unnecessary columns

As we show the number of NaN values in brand, type, year, and mileage columns, we can remove these columns. Because more than 80 percent of them are NaN. 

And we can also remove platform columns, because it connot help us in prediction.

In [17]:
def remove_unnecessary_features(dataframe):
    selected_features = ['title', 'desc', 'image_count', 'price', 'cat1']
    dataframe = dataframe[selected_features]
    dataframe.rename(columns={'cat1': 'category'}, inplace=True)
    return dataframe

In [18]:
df = remove_unnecessary_features(df)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.rename(columns={'cat1': 'category'}, inplace=True)


Unnamed: 0,title,desc,image_count,price,category
0,ست مبلمان نهارخوری ٩ نفره,کلاسیک استثنایی چرم مالزی چوب راش العاده...,5,3850000,for-the-home
1,گلدون مصنوعی نخل,"سلام,یک عدد گلدون نخل سه طبقه سالم پایه...",1,30000,for-the-home
2,لودر کاتر پیلار 950,سریال 43 j شکستگی سه حلقه لاستیک حلقه ...,4,-1,vehicles
3,مبل راحتی نفره بامیز مبلی,مبل راحتی نفره شامل سه نفره عدد دونفره ...,4,600000,for-the-home
4,ماشین شارژی,شارژی کنترلی سویچ حمل 35 کیلو صندلی نفر...,2,450000,personal
...,...,...,...,...,...
9995,مانتو مجلسی قرمز سایز 44 46,پوشیده,4,55000,personal
9996,لباس مجلسییییی,لباس گیپور عکس مشخصه تمیز ۱بار تن همراه...,3,70000,personal
9997,HUAWEI g6,نونو همراه جبه,3,400000,electronic-devices
9998,نمکپاش قندان,اجناس دوست بپیوندید,1,41000,for-the-home


### New dataset

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_price = pd.DataFrame(scaler.fit_transform(df[['price']]), columns=['price'])
df = pd.concat([df, pd.DataFrame(selected_tfidf_titles)], axis=1)
df = pd.concat([df, pd.DataFrame(selected_tfidf_desc)], axis=1).drop(columns=['title', 'desc'])
df.columns = df.columns.astype(str)

df

Unnamed: 0,image_count,price,category,100,101,102,103,104,105,106,...,90,91,92,93,94,95,96,97,98,99
0,5,3850000,for-the-home,0.0,0.0,0.0,0.41962,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.126748
1,1,30000,for-the-home,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.167239,0.0,0.0,0.0,0.0,0.000000,0.107378
2,4,-1,vehicles,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.098232
3,4,600000,for-the-home,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.385534,0.0,0.0,0.0,0.0,0.103171,0.000000
4,2,450000,personal,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,4,55000,personal,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
9996,3,70000,personal,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.103892,0.000000
9997,3,400000,electronic-devices,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
9998,1,41000,for-the-home,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000


In [20]:
df['price'] = scaled_price

In [21]:
df

Unnamed: 0,image_count,price,category,100,101,102,103,104,105,106,...,90,91,92,93,94,95,96,97,98,99
0,5,0.016651,for-the-home,0.0,0.0,0.0,0.41962,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.126748
1,1,-0.239945,for-the-home,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.167239,0.0,0.0,0.0,0.0,0.000000,0.107378
2,4,-0.241960,vehicles,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.098232
3,4,-0.201657,for-the-home,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.385534,0.0,0.0,0.0,0.0,0.103171,0.000000
4,2,-0.211733,personal,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,4,-0.238266,personal,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
9996,3,-0.237258,personal,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.103892,0.000000
9997,3,-0.215091,electronic-devices,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
9998,1,-0.239206,for-the-home,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000


In [22]:
df.to_csv('exp2-tfidf-manual-heterogeneous.csv')

# Logistic Regression

In [53]:
categories_count

category
for-the-home          3044
vehicles              2208
electronic-devices    1738
personal              1525
leisure-hobbies        849
businesses             636
Name: count, dtype: int64

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train, test = train_test_split(df, test_size=0.2)

X_train = train.drop(columns=['category'])
y_train = train['category']

X_test = test.drop(columns=['category'])
y_test = test['category']

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

preds = clf.predict(X_test)
print(accuracy_score(preds, y_test))

0.726


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
