# Flow 1:

## TF-IDF, TSVD, Uniform Sampling

# Data Loading

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../divar_reduced_posts.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,archive_by_user,brand,cat1,city,created_at,desc,id,image_count,mileage,platform,price,title,type,year
0,0,False,,for-the-home,Tehran,Tuesday 07PM,کلاسیک و شیک و استثنایی\nچرم مالزی\nچوب راش\nف...,54761638662241,5,,mobile,3850000,ست مبلمان و نهارخوری ٩ نفره,,
1,1,False,,for-the-home,Mashhad,Tuesday 07PM,"سلام,یک عدد گلدون نخل سه طبقه ی سالم دارم با پ...",42727631379133,1,,mobile,30000,گلدون مصنوعی نخل,,
2,2,False,,vehicles,Mashhad,Tuesday 07PM,سریال 43j$NUMبدون شکستگی سه حلقه لاستیک نو یک ...,63194439667302,4,,mobile,-1,لودر کاتر پیلار 950,,
3,3,False,,for-the-home,Tehran,Tuesday 07PM,مبل راحتی هفت نفره شامل سه نفره یک عدد دونفره ...,19133025491169,4,,mobile,600000,مبل راحتی هفت نفره بامیز جلو مبلی,,
4,4,False,,personal,Karaj,Tuesday 08PM,شارژی کنترلی سویچ حمل تا 35 کیلو صندلی برای دو...,58998998335444,2,,mobile,450000,ماشین شارژی,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,False,,personal,Tehran,Wednesday 07PM,اصلا پوشیده نشده نو هستش,21063397838341,4,,mobile,55000,مانتو مجلسی قرمز سایز44 46,women,
9996,9996,False,,personal,Ahvaz,Wednesday 08PM,لباس تمام گیپور. تو عکس مشخصه تمیز.فقط ۱بار تن...,42819975080363,3,,mobile,70000,لباس مجلسییییی شیک,women,
9997,9997,False,Huawei::هوآوی,electronic-devices,Karaj,Thursday 04AM,نونو همراه با جبه با همه چی,7494598410644,3,,mobile,400000,HUAWEI g6,,
9998,9998,False,,for-the-home,Tehran,Thursday 04AM,باز هم اجناس دیگه هم هست اگه دوست داشتید به تل...,24182403631548,1,,mobile,41000,نمکپاش و قندان,,


# Feature Engineering

### Remove unnecessary columns

As we show the number of NaN values in brand, type, year, and mileage columns, we can remove these columns. Because more than 80 percent of them are NaN. 

And we can also remove platform columns, because it connot help us in prediction.

In [4]:
def remove_unnecessary_features(dataframe):
    selected_features = ['title', 'desc', 'image_count', 'price', 'cat1']
    dataframe = dataframe[selected_features]
    dataframe.rename(columns={'cat1': 'category'}, inplace=True)
    return dataframe

In [5]:
df = remove_unnecessary_features(df)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.rename(columns={'cat1': 'category'}, inplace=True)


Unnamed: 0,title,desc,image_count,price,category
0,ست مبلمان و نهارخوری ٩ نفره,کلاسیک و شیک و استثنایی\nچرم مالزی\nچوب راش\nف...,5,3850000,for-the-home
1,گلدون مصنوعی نخل,"سلام,یک عدد گلدون نخل سه طبقه ی سالم دارم با پ...",1,30000,for-the-home
2,لودر کاتر پیلار 950,سریال 43j$NUMبدون شکستگی سه حلقه لاستیک نو یک ...,4,-1,vehicles
3,مبل راحتی هفت نفره بامیز جلو مبلی,مبل راحتی هفت نفره شامل سه نفره یک عدد دونفره ...,4,600000,for-the-home
4,ماشین شارژی,شارژی کنترلی سویچ حمل تا 35 کیلو صندلی برای دو...,2,450000,personal
...,...,...,...,...,...
9995,مانتو مجلسی قرمز سایز44 46,اصلا پوشیده نشده نو هستش,4,55000,personal
9996,لباس مجلسییییی شیک,لباس تمام گیپور. تو عکس مشخصه تمیز.فقط ۱بار تن...,3,70000,personal
9997,HUAWEI g6,نونو همراه با جبه با همه چی,3,400000,electronic-devices
9998,نمکپاش و قندان,باز هم اجناس دیگه هم هست اگه دوست داشتید به تل...,1,41000,for-the-home


# Data Encoding and Feature Engineering

### Encoding with TF-IDF

In [6]:
from hazm import *
import codecs


normalizer = Normalizer()
stopwords = [normalizer.normalize(x.strip()) for x in codecs.open(
    '../stopwords.txt', 'r', 'utf-8').readlines()]
stopwords.extend(['تماس', 'تلگرام', 'واتساپ', 'با سلام', 'سلام', 'فروش', 'فوری',
                 'قیمت', 'مقطوع', 'تخفیف', 'نو', 'در حد نو', 'خرید', 'اقساطی', 'تحویل', 'اقساط'])

In [7]:
from tqdm import tqdm


for idx in tqdm(range(df.shape[0])):
    title_sent = df['title'][idx].replace('$NUM' , ' ')
    desc_sent = df['desc'][idx].replace('$NUM' , ' ')
    
    df.loc[idx, 'desc'] = '  '.join([str(elem) for elem in [t for t in word_tokenize(desc_sent) if t not in stopwords]])
    df.loc[idx, 'title'] = '  '.join([str(elem) for elem in [t for t in word_tokenize(title_sent) if t not in stopwords]])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:03<00:00, 2637.17it/s]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from hazm import Normalizer, word_tokenize


def vectorize_with_tf_idf(dataframe):
    normalizer = Normalizer()

    title_tfidf = TfidfVectorizer(
        ngram_range=(1, 1), norm='l2', preprocessor=normalizer.normalize, tokenizer=word_tokenize
    )
    title_tfidf_matrix = title_tfidf.fit_transform(df['title'].values)
#     print(title_tfidf.vocabulary_)
#     print(len(title_tfidf.vocabulary_))


    desc_tfidf = TfidfVectorizer(
        ngram_range=(1, 1), norm='l2', preprocessor=normalizer.normalize, tokenizer=word_tokenize
    )
    desc_tfidf_matrix = desc_tfidf.fit_transform(df['desc'].values)
#     print(desc_tfidf.vocabulary_)
#     print(len(desc_tfidf.vocabulary_))
    
    return title_tfidf_matrix, desc_tfidf_matrix

In [9]:
title_tfidf_matrix, desc_tfidf_matrix = vectorize_with_tf_idf(df)



In [10]:
print('title vector shape:', title_tfidf_matrix.shape)
print('desc vector shape:', desc_tfidf_matrix.shape)

title vector shape: (10000, 6883)
desc vector shape: (10000, 16440)


### Reduce tfidf vector dimension with TSVD

In [11]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
reduced_title_matrix = svd.fit_transform(title_tfidf_matrix)


svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
reduced_desc_matrix = svd.fit_transform(desc_tfidf_matrix)

In [12]:
print(reduced_title_matrix.shape)
print(reduced_desc_matrix.shape)

(10000, 100)
(10000, 100)


### New dataset

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_price = pd.DataFrame(scaler.fit_transform(df[['price']]), columns=['price'])
df = pd.concat([df, pd.DataFrame(reduced_title_matrix)], axis=1)
df = pd.concat([df, pd.DataFrame(reduced_desc_matrix)], axis=1).drop(columns=['title', 'desc'])
df.columns = df.columns.astype(str)

df

Unnamed: 0,image_count,price,category,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
0,5,3850000,for-the-home,0.171454,-0.062200,-0.007692,0.001936,0.000616,-0.051826,-0.156769,...,-0.002913,0.020399,0.014931,0.004725,-0.041608,0.018221,-0.016710,-0.011949,-0.000148,-0.013728
1,1,30000,for-the-home,0.000074,0.000036,0.000030,0.000208,-0.000006,-0.000073,-0.000270,...,0.015497,-0.014476,-0.018069,-0.023659,0.019660,-0.004181,-0.032781,-0.015949,-0.014606,0.017309
2,4,-1,vehicles,0.000088,0.000007,0.001177,-0.000123,0.000180,0.000871,-0.000459,...,-0.026372,0.007972,-0.005071,-0.013696,0.032250,-0.022912,-0.008527,-0.010512,-0.003433,0.006728
3,4,600000,for-the-home,0.487357,-0.289209,-0.024597,-0.005395,-0.022020,-0.025608,-0.037837,...,-0.005287,-0.001381,0.018599,0.005416,-0.018997,0.000562,-0.009133,-0.007235,0.006300,0.002512
4,2,450000,personal,0.001813,0.000474,0.003846,0.004088,0.004950,0.004777,-0.002347,...,-0.012292,0.003556,-0.014981,0.020269,-0.003238,0.004127,-0.015352,-0.010302,-0.001340,0.006742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,4,55000,personal,0.003755,0.000088,0.036091,0.290491,-0.011098,-0.002299,0.001809,...,0.018376,0.029499,0.035439,-0.053063,0.026115,0.007095,-0.032548,0.020386,-0.043339,-0.035981
9996,3,70000,personal,0.005343,-0.001617,0.038371,0.313492,-0.010520,-0.004929,-0.000884,...,-0.036026,-0.005603,-0.052718,0.013900,0.146942,0.055515,-0.000247,0.042900,-0.060235,0.065512
9997,3,400000,electronic-devices,0.000005,0.000005,0.000020,0.000001,0.000035,0.000085,0.000111,...,-0.010597,-0.031722,-0.024886,0.055928,0.053822,0.030621,0.047366,0.010524,-0.006963,-0.003509
9998,1,41000,for-the-home,0.000050,0.000040,0.000020,0.000033,0.000049,0.000019,-0.000125,...,0.001378,0.005395,0.000560,0.004736,-0.002539,-0.000968,0.004816,-0.002942,-0.001170,-0.003692


In [14]:
df['price'] = scaled_price

In [15]:
df

Unnamed: 0,image_count,price,category,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
0,5,0.016651,for-the-home,0.171454,-0.062200,-0.007692,0.001936,0.000616,-0.051826,-0.156769,...,-0.002913,0.020399,0.014931,0.004725,-0.041608,0.018221,-0.016710,-0.011949,-0.000148,-0.013728
1,1,-0.239945,for-the-home,0.000074,0.000036,0.000030,0.000208,-0.000006,-0.000073,-0.000270,...,0.015497,-0.014476,-0.018069,-0.023659,0.019660,-0.004181,-0.032781,-0.015949,-0.014606,0.017309
2,4,-0.241960,vehicles,0.000088,0.000007,0.001177,-0.000123,0.000180,0.000871,-0.000459,...,-0.026372,0.007972,-0.005071,-0.013696,0.032250,-0.022912,-0.008527,-0.010512,-0.003433,0.006728
3,4,-0.201657,for-the-home,0.487357,-0.289209,-0.024597,-0.005395,-0.022020,-0.025608,-0.037837,...,-0.005287,-0.001381,0.018599,0.005416,-0.018997,0.000562,-0.009133,-0.007235,0.006300,0.002512
4,2,-0.211733,personal,0.001813,0.000474,0.003846,0.004088,0.004950,0.004777,-0.002347,...,-0.012292,0.003556,-0.014981,0.020269,-0.003238,0.004127,-0.015352,-0.010302,-0.001340,0.006742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,4,-0.238266,personal,0.003755,0.000088,0.036091,0.290491,-0.011098,-0.002299,0.001809,...,0.018376,0.029499,0.035439,-0.053063,0.026115,0.007095,-0.032548,0.020386,-0.043339,-0.035981
9996,3,-0.237258,personal,0.005343,-0.001617,0.038371,0.313492,-0.010520,-0.004929,-0.000884,...,-0.036026,-0.005603,-0.052718,0.013900,0.146942,0.055515,-0.000247,0.042900,-0.060235,0.065512
9997,3,-0.215091,electronic-devices,0.000005,0.000005,0.000020,0.000001,0.000035,0.000085,0.000111,...,-0.010597,-0.031722,-0.024886,0.055928,0.053822,0.030621,0.047366,0.010524,-0.006963,-0.003509
9998,1,-0.239206,for-the-home,0.000050,0.000040,0.000020,0.000033,0.000049,0.000019,-0.000125,...,0.001378,0.005395,0.000560,0.004736,-0.002539,-0.000968,0.004816,-0.002942,-0.001170,-0.003692


# Make Uniform Distribution

In [16]:
categories_count = df['category'].value_counts()

min(categories_count.values)

636

In [17]:
grouped = df.groupby('category')
uniform_df = grouped.apply(lambda x: x.sample(n=min(categories_count.values)))

In [18]:
uniform_df['category'].value_counts()

category
businesses            636
electronic-devices    636
for-the-home          636
leisure-hobbies       636
personal              636
vehicles              636
Name: count, dtype: int64

In [19]:
uniform_df = uniform_df.reset_index(drop=True)
uniform_df = uniform_df.sample(frac=1)

In [20]:
uniform_df

Unnamed: 0,image_count,price,category,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
1916,2,-0.241960,leisure-hobbies,0.000275,0.000161,0.002829,0.000668,-0.000065,0.001678,0.000932,...,0.013086,-0.025168,0.008362,0.091295,0.015664,-0.031747,-0.011498,-0.038505,0.005101,-0.052451
2221,0,-0.240953,leisure-hobbies,0.000762,0.000429,0.015545,-0.001445,0.000651,0.040641,-0.022405,...,0.013571,0.007854,0.010986,-0.002342,0.009291,-0.001305,0.000678,-0.007368,-0.001135,0.005296
2374,2,-0.240617,leisure-hobbies,0.000093,0.000066,0.000201,0.000106,0.000153,0.000054,-0.000097,...,0.014358,0.010360,-0.013794,0.009418,-0.024312,-0.023187,0.017369,-0.031570,0.000734,0.002437
3119,1,-0.241288,personal,0.011845,0.000455,0.156404,-0.004002,-0.006934,0.073106,-0.037304,...,0.043810,0.000130,-0.002895,-0.009361,-0.000302,0.015487,0.067453,-0.024571,-0.036797,0.014600
2859,2,-0.240617,personal,0.002078,0.003268,0.000947,0.000190,0.001715,-0.000146,-0.000720,...,-0.054928,0.065248,0.049696,0.095784,-0.013919,-0.085203,0.042899,-0.011604,-0.108139,-0.010924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3306,1,0.050237,vehicles,0.017380,0.004669,0.448645,-0.057987,-0.023185,-0.098053,0.023162,...,-0.003654,-0.007029,-0.002552,0.005912,-0.018992,-0.063599,-0.043507,0.047135,0.047617,-0.052775
3292,3,2.579253,vehicles,0.000342,0.000106,0.009721,-0.001201,-0.000393,0.010473,-0.005321,...,0.006920,-0.020781,-0.032218,0.000587,-0.036948,-0.006133,-0.011478,0.037758,-0.039217,-0.003923
462,0,-0.240617,businesses,0.007163,0.005239,0.001709,0.004461,0.005998,0.001067,-0.007008,...,-0.007496,0.012043,-0.005835,-0.004114,-0.015313,0.003164,0.016983,0.006201,0.004229,-0.006740
2078,2,-0.235243,leisure-hobbies,0.000040,0.000022,0.000627,0.000040,0.000099,0.000714,-0.000487,...,0.020651,0.031149,0.024529,-0.000757,0.015495,-0.013400,-0.028318,0.000792,-0.014254,0.004419


In [21]:
uniform_df.to_csv('exp1-tfidf-tsvd-uniform.csv.csv')

# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train, test = train_test_split(uniform_df, test_size=0.2)

X_train = train.drop(columns=['category'])
y_train = train['category']

X_test = test.drop(columns=['category'])
y_test = test['category']

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

preds = clf.predict(X_test)
print(accuracy_score(preds, y_test))

0.7513089005235603


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
