# Flow 3:

## CountVec, TSVD, Heterogeneity of Data Distribution

# Flow 4

## CountVec, TSVD, Change Loss Function

# Data Loading

In [35]:
import pandas as pd

In [36]:
df = pd.read_csv('../divar_reduced_posts.csv').drop(columns=['Unnamed: 0'])

In [37]:
df

Unnamed: 0,archive_by_user,brand,cat1,city,created_at,desc,id,image_count,mileage,platform,price,title,type,year
0,False,,for-the-home,Tehran,Tuesday 07PM,کلاسیک و شیک و استثنایی\nچرم مالزی\nچوب راش\nف...,54761638662241,5,,mobile,3850000,ست مبلمان و نهارخوری ٩ نفره,,
1,False,,for-the-home,Mashhad,Tuesday 07PM,"سلام,یک عدد گلدون نخل سه طبقه ی سالم دارم با پ...",42727631379133,1,,mobile,30000,گلدون مصنوعی نخل,,
2,False,,vehicles,Mashhad,Tuesday 07PM,سریال 43j$NUMبدون شکستگی سه حلقه لاستیک نو یک ...,63194439667302,4,,mobile,-1,لودر کاتر پیلار 950,,
3,False,,for-the-home,Tehran,Tuesday 07PM,مبل راحتی هفت نفره شامل سه نفره یک عدد دونفره ...,19133025491169,4,,mobile,600000,مبل راحتی هفت نفره بامیز جلو مبلی,,
4,False,,personal,Karaj,Tuesday 08PM,شارژی کنترلی سویچ حمل تا 35 کیلو صندلی برای دو...,58998998335444,2,,mobile,450000,ماشین شارژی,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,,personal,Tehran,Wednesday 07PM,اصلا پوشیده نشده نو هستش,21063397838341,4,,mobile,55000,مانتو مجلسی قرمز سایز44 46,women,
9996,False,,personal,Ahvaz,Wednesday 08PM,لباس تمام گیپور. تو عکس مشخصه تمیز.فقط ۱بار تن...,42819975080363,3,,mobile,70000,لباس مجلسییییی شیک,women,
9997,False,Huawei::هوآوی,electronic-devices,Karaj,Thursday 04AM,نونو همراه با جبه با همه چی,7494598410644,3,,mobile,400000,HUAWEI g6,,
9998,False,,for-the-home,Tehran,Thursday 04AM,باز هم اجناس دیگه هم هست اگه دوست داشتید به تل...,24182403631548,1,,mobile,41000,نمکپاش و قندان,,


In [38]:
df.shape

(10000, 14)

# Data Encoding

### Encoding with CountVec

In [39]:
from hazm import *
import codecs


normalizer = Normalizer()
stopwords = [normalizer.normalize(x.strip()) for x in codecs.open(
    '../stopwords.txt', 'r', 'utf-8').readlines()]
stopwords.extend(['تماس', 'تلگرام', 'واتساپ', 'با سلام', 'سلام', 'فروش', 'فوری',
                 'قیمت', 'مقطوع', 'تخفیف', 'نو', 'در حد نو', 'خرید', 'اقساطی', 'تحویل', 'اقساط'])

In [40]:
from tqdm import tqdm


for idx in tqdm(range(df.shape[0])):
    title_sent = df['title'][idx].replace('$NUM' , ' ')
    desc_sent = df['desc'][idx].replace('$NUM' , ' ')
    
    df.loc[idx, 'desc'] = '  '.join([str(elem) for elem in [t for t in word_tokenize(desc_sent) if t not in stopwords]])
    df.loc[idx, 'title'] = '  '.join([str(elem) for elem in [t for t in word_tokenize(title_sent) if t not in stopwords]])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:03<00:00, 2542.61it/s]


In [41]:
from sklearn.feature_extraction.text import CountVectorizer

from hazm import Normalizer, word_tokenize


def vectorize_with_count_vec(dataframe):
    normalizer = Normalizer()

    title_vectorizer = CountVectorizer(
        ngram_range=(1, 1), preprocessor=normalizer.normalize, tokenizer=word_tokenize
    )
    title_matrix = title_vectorizer.fit_transform(df['title'].values)

    desc_vectorizer = CountVectorizer(
        ngram_range=(1, 1), preprocessor=normalizer.normalize, tokenizer=word_tokenize
    )
    desc_matrix = title_vectorizer.fit_transform(df['desc'].values)


    
    return title_matrix, desc_matrix

In [42]:
title_matrix, desc_matrix = vectorize_with_count_vec(df)



In [43]:
print('title vector shape:', title_matrix.shape)
print('desc vector shape:', desc_matrix.shape)

title vector shape: (10000, 6883)
desc vector shape: (10000, 16440)


# Feature Engineering

### Reduce CountVec vector dimension with TSVD

In [44]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
reduced_title_matrix = svd.fit_transform(title_matrix)


svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
reduced_desc_matrix = svd.fit_transform(desc_matrix)

In [45]:
print(reduced_title_matrix.shape)
print(reduced_desc_matrix.shape)

(10000, 100)
(10000, 100)


### Manually Remove unnecessary columns

As we show the number of NaN values in brand, type, year, and mileage columns, we can remove these columns. Because more than 80 percent of them are NaN. 

And we can also remove platform columns, because it connot help us in prediction.

In [46]:
def remove_unnecessary_features(dataframe):
    selected_features = ['title', 'desc', 'image_count', 'price', 'cat1']
    dataframe = dataframe[selected_features]
    dataframe.rename(columns={'cat1': 'category'}, inplace=True)
    return dataframe

In [47]:
df = remove_unnecessary_features(df)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.rename(columns={'cat1': 'category'}, inplace=True)


Unnamed: 0,title,desc,image_count,price,category
0,ست مبلمان نهارخوری ٩ نفره,کلاسیک استثنایی چرم مالزی چوب راش العاده...,5,3850000,for-the-home
1,گلدون مصنوعی نخل,"سلام,یک عدد گلدون نخل سه طبقه سالم پایه...",1,30000,for-the-home
2,لودر کاتر پیلار 950,سریال 43 j شکستگی سه حلقه لاستیک حلقه ...,4,-1,vehicles
3,مبل راحتی نفره بامیز مبلی,مبل راحتی نفره شامل سه نفره عدد دونفره ...,4,600000,for-the-home
4,ماشین شارژی,شارژی کنترلی سویچ حمل 35 کیلو صندلی نفر...,2,450000,personal
...,...,...,...,...,...
9995,مانتو مجلسی قرمز سایز 44 46,پوشیده,4,55000,personal
9996,لباس مجلسییییی,لباس گیپور عکس مشخصه تمیز ۱بار تن همراه...,3,70000,personal
9997,HUAWEI g6,نونو همراه جبه,3,400000,electronic-devices
9998,نمکپاش قندان,اجناس دوست بپیوندید,1,41000,for-the-home


### New dataset

In [48]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_price = pd.DataFrame(scaler.fit_transform(df[['price']]), columns=['price'])
df = pd.concat([df, pd.DataFrame(reduced_title_matrix)], axis=1)
df = pd.concat([df, pd.DataFrame(reduced_desc_matrix)], axis=1).drop(columns=['title', 'desc'])
df.columns = df.columns.astype(str)

df

Unnamed: 0,image_count,price,category,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
0,5,3850000,for-the-home,0.337897,-0.196571,0.548477,-0.009931,0.031406,-0.137060,-0.012310,...,0.037930,0.025205,0.063707,-0.424549,-0.003583,0.017697,-0.054612,0.328081,0.022685,-0.008915
1,1,30000,for-the-home,0.000133,-0.000064,0.000088,0.001816,0.000073,-0.000136,-0.000017,...,-0.095165,-0.142137,0.156994,0.022871,-0.013930,0.163717,-0.124619,-0.180448,-0.109227,0.173319
2,4,-1,vehicles,0.002804,0.004573,0.000007,-0.000018,-0.004847,-0.001314,0.001598,...,-0.269281,-0.322366,-0.086429,-0.009075,-0.134687,-0.137713,0.001380,-0.110790,-0.031407,-0.044108
3,4,600000,for-the-home,0.637419,-0.351754,1.423672,-0.039839,0.026348,0.002904,-0.187817,...,0.198882,-0.027814,0.055814,0.037193,0.078730,0.097205,-0.086396,0.030771,0.114432,-0.086944
4,2,450000,personal,0.005733,0.003237,0.003110,0.007370,-0.000946,0.006186,0.029635,...,-0.055286,0.103238,0.062498,0.007425,0.155020,-0.041577,0.118764,-0.041870,0.056849,0.037823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,4,55000,personal,0.021811,0.002863,0.010264,1.104038,0.029158,-0.039156,-0.052885,...,-0.023348,0.032159,-0.005572,0.002680,0.009611,-0.000180,0.010297,0.005354,0.008578,-0.002012
9996,3,70000,personal,0.014050,0.004402,0.008232,0.596991,0.010877,-0.037519,-0.013260,...,-0.321481,-0.306670,-0.011376,0.159756,-0.013640,-0.116444,0.229726,0.078714,-0.176663,-0.147841
9997,3,400000,electronic-devices,0.000033,0.000024,-0.000006,0.000018,-0.000126,0.000412,0.000836,...,0.037467,0.090573,-0.078273,-0.095403,-0.066012,-0.025944,0.067507,0.018785,-0.168899,-0.154260
9998,1,41000,for-the-home,0.000120,-0.000061,-0.000015,0.000078,-0.000004,-0.000075,0.000377,...,0.000226,-0.012256,0.001109,-0.002149,-0.015264,-0.003055,0.005889,0.000866,-0.002923,-0.005229


In [49]:
df['price'] = scaled_price

In [50]:
df

Unnamed: 0,image_count,price,category,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
0,5,0.016651,for-the-home,0.337897,-0.196571,0.548477,-0.009931,0.031406,-0.137060,-0.012310,...,0.037930,0.025205,0.063707,-0.424549,-0.003583,0.017697,-0.054612,0.328081,0.022685,-0.008915
1,1,-0.239945,for-the-home,0.000133,-0.000064,0.000088,0.001816,0.000073,-0.000136,-0.000017,...,-0.095165,-0.142137,0.156994,0.022871,-0.013930,0.163717,-0.124619,-0.180448,-0.109227,0.173319
2,4,-0.241960,vehicles,0.002804,0.004573,0.000007,-0.000018,-0.004847,-0.001314,0.001598,...,-0.269281,-0.322366,-0.086429,-0.009075,-0.134687,-0.137713,0.001380,-0.110790,-0.031407,-0.044108
3,4,-0.201657,for-the-home,0.637419,-0.351754,1.423672,-0.039839,0.026348,0.002904,-0.187817,...,0.198882,-0.027814,0.055814,0.037193,0.078730,0.097205,-0.086396,0.030771,0.114432,-0.086944
4,2,-0.211733,personal,0.005733,0.003237,0.003110,0.007370,-0.000946,0.006186,0.029635,...,-0.055286,0.103238,0.062498,0.007425,0.155020,-0.041577,0.118764,-0.041870,0.056849,0.037823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,4,-0.238266,personal,0.021811,0.002863,0.010264,1.104038,0.029158,-0.039156,-0.052885,...,-0.023348,0.032159,-0.005572,0.002680,0.009611,-0.000180,0.010297,0.005354,0.008578,-0.002012
9996,3,-0.237258,personal,0.014050,0.004402,0.008232,0.596991,0.010877,-0.037519,-0.013260,...,-0.321481,-0.306670,-0.011376,0.159756,-0.013640,-0.116444,0.229726,0.078714,-0.176663,-0.147841
9997,3,-0.215091,electronic-devices,0.000033,0.000024,-0.000006,0.000018,-0.000126,0.000412,0.000836,...,0.037467,0.090573,-0.078273,-0.095403,-0.066012,-0.025944,0.067507,0.018785,-0.168899,-0.154260
9998,1,-0.239206,for-the-home,0.000120,-0.000061,-0.000015,0.000078,-0.000004,-0.000075,0.000377,...,0.000226,-0.012256,0.001109,-0.002149,-0.015264,-0.003055,0.005889,0.000866,-0.002923,-0.005229


In [None]:
df.to_csv('exp3-countvec-tsvd-')

# Logistic Regression with Heterogeneity (flow 3)

In [18]:
categories_count = df['category'].value_counts()

categories_count

category
for-the-home          3044
vehicles              2208
electronic-devices    1738
personal              1525
leisure-hobbies        849
businesses             636
Name: count, dtype: int64

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train, test = train_test_split(df, test_size=0.2)

X_train = train.drop(columns=['category'])
y_train = train['category']

X_test = test.drop(columns=['category'])
y_test = test['category']

clf = LogisticRegression(random_state=0, max_iter=300).fit(X_train, y_train)

preds = clf.predict(X_test)
print(accuracy_score(preds, y_test))

0.809


In [20]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt


def conf_matrix(y_test, pred_test):
    
    # Creating a confusion matrix
    con_mat = confusion_matrix(y_test, pred_test)
    con_mat = pd.DataFrame(con_mat, range(6), range(6))
   
    #Ploting the confusion matrix
    plt.figure(figsize=(6,6))
    sns.set(font_scale=1.5) 
    sns.heatmap(con_mat, annot=True, annot_kws={"size": 16}, fmt='g', cmap='Blues', cbar=False)


In [25]:
from sklearn.metrics import classification_report

print(classification_report(y_test, preds, target_names=df['category'].unique()))

                    precision    recall  f1-score   support

      for-the-home       0.69      0.38      0.49       133
          vehicles       0.85      0.86      0.85       321
          personal       0.72      0.91      0.80       625
electronic-devices       0.80      0.59      0.68       166
        businesses       0.85      0.76      0.80       304
   leisure-hobbies       0.93      0.88      0.91       451

          accuracy                           0.81      2000
         macro avg       0.81      0.73      0.76      2000
      weighted avg       0.81      0.81      0.80      2000



# Logistic Regression with Change loss funciton (flow 4)

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train, test = train_test_split(df, test_size=0.2)

X_train = train.drop(columns=['category'])
y_train = train['category']

X_test = test.drop(columns=['category'])
y_test = test['category']

clf = LogisticRegression(random_state=10, solver='newton-cg', class_weight='balanced').fit(X_train, y_train)

preds = clf.predict(X_test)
print(accuracy_score(preds, y_test))

0.759


In [28]:
from sklearn.metrics import classification_report

print(classification_report(y_test, preds, target_names=df['category'].unique()))

                    precision    recall  f1-score   support

      for-the-home       0.30      0.64      0.41       128
          vehicles       0.85      0.81      0.83       334
          personal       0.86      0.71      0.78       571
electronic-devices       0.57      0.72      0.64       178
        businesses       0.82      0.75      0.78       320
   leisure-hobbies       0.95      0.84      0.89       469

          accuracy                           0.76      2000
         macro avg       0.72      0.75      0.72      2000
      weighted avg       0.81      0.76      0.78      2000

