In [56]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors
# %matplotlib inline
plt.style.use("ggplot")

## Read file.

In [57]:
# Load the preprocessed profile table from the working directory.
# NOTE(review): "precessed" looks like a typo of "processed" — confirm the file name on disk before renaming.
df = pd.read_csv("precessed_data.csv")

In [58]:
# Preview the first five rows of the loaded table.
df.head(5)

Unnamed: 0,id,Workplaces,College,School,Places,Gender
0,1,no_workplaces,no_college,no_school,no_places,female
1,2,works at esquel group,no_college,no_school,bien hoa angiang an giang vietnam,notknown
2,3,long xuyen,quan tri mang truong trung cap cong nghe bach ...,no_school,long xuyen long xuyen,notknown
3,4,no_workplaces,no_college,no_school,thanh hoa thanh hoa,male
4,5,works at binh duong viet nammay 16 2020 present,no_college,no_school,tan chau,female


## Filter out rows that have no information in any of the 4 columns.

In [59]:
# Placeholder value that each of the four profile columns holds when it carries no information.
placeholders = {
    "Workplaces": "no_workplaces",
    "College": "no_college",
    "School": "no_school",
    "Places": "no_places",
}

# A row qualifies only when *all* four columns equal their placeholder.
no_info_mask = pd.Series(True, index=df.index)
for column, placeholder in placeholders.items():
    no_info_mask &= df[column].eq(placeholder)

# Independent copy of the matching rows, so later edits cannot touch `df`.
df_no_show = df.loc[no_info_mask].copy()

In [60]:
# Ids of the rows that carry no information, as a plain Python list.
list_no_show = df_no_show["id"].tolist()

In [61]:
# Number of ids collected from rows where all four profile columns hold their placeholder.
len(list_no_show)

3168

In [62]:
# Keep every row whose id is NOT in the no-information list.
is_no_show = df["id"].isin(list_no_show)
data = df.loc[~is_no_show]

In [63]:
# Display the filtered frame (pandas truncates the rendering of long frames).
data

Unnamed: 0,id,Workplaces,College,School,Places,Gender
1,2,works at esquel group,no_college,no_school,bien hoa angiang an giang vietnam,notknown
2,3,long xuyen,quan tri mang truong trung cap cong nghe bach ...,no_school,long xuyen long xuyen,notknown
3,4,no_workplaces,no_college,no_school,thanh hoa thanh hoa,male
4,5,works at binh duong viet nammay 16 2020 present,no_college,no_school,tan chau,female
5,6,ben tre can tho,can tho university,truong thpt nguyen trai ben tre,ben tre ben ben tre vietnam,male
...,...,...,...,...,...,...
16237,19718,ho chi minh city vietnam,tc kt nn,thpt nguyen thi minh khai bt,ho chi minh city vietnam bac binh,female
16238,19719,no_workplaces,no_college,no_school,hanoi vietnam,female
16239,19720,no_workplaces,quan tri kinh doanh bvu truong dai hoc ba ria ...,truong thpt dinh tien hoang tp vung tau truong...,vung tau ho chi minh city vietnam,male
16240,19721,no_workplaces,no_college,thpt nguyen chi thanh phu tan an giang,angiang an giang vietnam,female


In [64]:
# (rows, columns) of the filtered frame.
data.shape

(13074, 6)

In [65]:
# Persist the filtered rows to the working directory; index=False leaves the row labels out of the file.
data.to_csv("final_data.csv", index = False)

## Combine the 5 text columns (Workplaces, College, School, Places, Gender) into a single string column.

In [66]:
# Reload the file written above. Reading it back gives the frame a default 0..n-1 RangeIndex
# instead of the row labels left over from filtering.
# NOTE(review): read_csv applies its default NA parsing here, so cells such as "nan", "null" or
# empty text would come back as NaN — confirm the data contains none.
data = pd.read_csv("final_data.csv")

In [67]:
# Preview the first rows of the reloaded frame.
data.head()

Unnamed: 0,id,Workplaces,College,School,Places,Gender
0,2,works at esquel group,no_college,no_school,bien hoa angiang an giang vietnam,notknown
1,3,long xuyen,quan tri mang truong trung cap cong nghe bach ...,no_school,long xuyen long xuyen,notknown
2,4,no_workplaces,no_college,no_school,thanh hoa thanh hoa,male
3,5,works at binh duong viet nammay 16 2020 present,no_college,no_school,tan chau,female
4,6,ben tre can tho,can tho university,truong thpt nguyen trai ben tre,ben tre ben ben tre vietnam,male


In [68]:
# Inspect the row index of the reloaded frame.
data.index

RangeIndex(start=0, stop=13074, step=1)

In [69]:
# Build one free-text field per row by joining the five text columns with single spaces.
# Done column-wise with Series.str.cat instead of a per-row Python ' '.join.
# Missing cells are replaced by empty text first: ' '.join raises TypeError on NaN, and NaN can
# appear here because read_csv parses values such as "nan", "null" or "" as missing.
# For rows with no missing cells the result is identical to ' '.join over the same columns.
text_columns = ['Workplaces', 'College', 'School', 'Places', 'Gender']
text_parts = data[text_columns].fillna('').astype(str)
first_column, *other_columns = text_columns
data['string'] = text_parts[first_column].str.cat(text_parts[other_columns], sep=' ')

In [70]:
# Inspect the combined text column.
data.string

0        works at esquel group no_college no_school bie...
1        long xuyen quan tri mang truong trung cap cong...
2        no_workplaces no_college no_school thanh hoa t...
3        works at binh duong viet nammay 16 2020 presen...
4        ben tre can tho can tho university truong thpt...
                               ...                        
13069    ho chi minh city vietnam tc kt nn thpt nguyen ...
13070    no_workplaces no_college no_school hanoi vietn...
13071    no_workplaces quan tri kinh doanh bvu truong d...
13072    no_workplaces no_college thpt nguyen chi thanh...
13073    works at phu tan an giang no_college no_school...
Name: string, Length: 13074, dtype: object

## Content-based features (bag-of-words vectors).

In [76]:
# Keep only the identifier and the combined text.
# Note: this rebinds `df`, which until now referred to the raw table loaded at the top.
df = data.loc[:, ['id', 'string']]

In [77]:
# Display the id / combined-text frame.
df

Unnamed: 0,id,string
0,2,works at esquel group no_college no_school bie...
1,3,long xuyen quan tri mang truong trung cap cong...
2,4,no_workplaces no_college no_school thanh hoa t...
3,5,works at binh duong viet nammay 16 2020 presen...
4,6,ben tre can tho can tho university truong thpt...
...,...,...
13069,19718,ho chi minh city vietnam tc kt nn thpt nguyen ...
13070,19719,no_workplaces no_college no_school hanoi vietn...
13071,19720,no_workplaces quan tri kinh doanh bvu truong d...
13072,19721,no_workplaces no_college thpt nguyen chi thanh...


In [90]:
# Fit a CountVectorizer (default settings) on the combined text and encode every row
# as term counts; fit_transform returns a sparse matrix with one row per input string.
vectorizer = CountVectorizer()
X1 = vectorizer.fit_transform(df["string"])


<13074x4263 sparse matrix of type '<class 'numpy.int64'>'
	with 188532 stored elements in Compressed Sparse Row format>

In [103]:
# Feature names (vocabulary terms) learned by the fitted vectorizer.
vectorizer.get_feature_names_out()

array(['000', '01', '012735258888', ..., 'zoom', 'zuo', 'zviet'],
      dtype=object)

In [104]:
# Convert the sparse count matrix to a dense DataFrame with one column per vocabulary term.
# toarray() materialises every zero, so this frame is much larger in memory than X1.
feature_names = vectorizer.get_feature_names_out()
d = pd.DataFrame(data=X1.toarray(), columns=feature_names)


Unnamed: 0,000,01,012735258888,03,0328,0334887973,0343511696,0353171445,0374122212,0571000054485,...,zigexn,zigvy,zin,zingplay,zip,zo,zone,zoom,zuo,zviet
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
# Display the dense term-count frame.
d

Unnamed: 0,000,01,012735258888,03,0328,0334887973,0343511696,0353171445,0374122212,0571000054485,...,zigexn,zigvy,zin,zingplay,zip,zo,zone,zoom,zuo,zviet
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13072,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
# Put the id column in front of the term-count columns; concat(axis=1) pairs rows by index label.
# NOTE(review): if the vocabulary contains the token "id", the result will have two columns
# named "id" — confirm against the feature names.
id_column = df[['id']]
df = pd.concat([id_column, d], axis=1)
df.head()

Unnamed: 0,id,000,01,012735258888,03,0328,0334887973,0343511696,0353171445,0374122212,...,zigexn,zigvy,zin,zingplay,zip,zo,zone,zoom,zuo,zviet
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
# Write the id + term-count table to disk, without the row index.
# to_csv does not create missing folders, so make sure the target directory exists first;
# otherwise a fresh checkout fails here with an OSError.
features_path = Path('data/reprocessed_data/features_vectors.csv')
features_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(features_path, index=False)