# Mô hình dự đoán tương tác dựa vào thời gian và nội dung 

## Chuẩn bị dữ liệu 

In [24]:
import pandas as pd 
import numpy as np 
import sklearn
import datetime, re
import underthesea

In [25]:
gam_1_df = pd.read_csv('Data\GAM_data_sub.csv')
gam_2_df = pd.read_csv('Data\GAMeSportsVN.csv')

main_df = pd.concat([gam_2_df, gam_1_df])

main_df = main_df.dropna(subset= [ "text", "time"])
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1205 entries, 0 to 989
Data columns (total 51 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   post_id                        1205 non-null   int64  
 1   text                           1205 non-null   object 
 2   post_text                      1204 non-null   object 
 3   shared_text                    70 non-null     object 
 4   original_text                  25 non-null     object 
 5   time                           1205 non-null   object 
 6   timestamp                      1195 non-null   float64
 7   image                          844 non-null    object 
 8   image_lowquality               1203 non-null   object 
 9   images                         1200 non-null   object 
 10  images_description             1200 non-null   object 
 11  images_lowquality              1205 non-null   object 
 12  images_lowquality_description  1205 non-null   object 

### Làm sạch và xử lý missing

In [26]:
main_df = main_df.loc[:, [ 'text','time', 'image', 'video','reaction_count']]
main_df.loc[main_df['reaction_count'] == 0, 'reaction_count'] = None
main_df = main_df.dropna(subset=['reaction_count'])

In [27]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 349 entries, 0 to 987
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   text            349 non-null    object 
 1   time            349 non-null    object 
 2   image           248 non-null    object 
 3   video           80 non-null     object 
 4   reaction_count  349 non-null    float64
dtypes: float64(1), object(4)
memory usage: 16.4+ KB


### Phân tách dữ liệu

In [28]:
main_df['time'] = pd.to_datetime(main_df['time'])
main_df['weekday'] = main_df['time'].dt.day_of_week
main_df['hour'] = main_df['time'].dt.hour
main_df.drop(['time'], inplace = True, axis = 1)

In [29]:
main_df['image'] = ~ main_df['image'].isna()
main_df['video'] = ~main_df['video'].isna()

In [30]:
main_df.head()

Unnamed: 0,text,image,video,reaction_count,weekday,hour
0,"Palette: Góc đẹp, nhìn em mở nè anh em! 😎 #GAM...",False,True,226.0,1,19
1,Cái gì quan trọng thì mình nhắc lại 42 lần! 🙄 ...,False,True,321.0,1,17
2,Tâm linh không đùa được đâu! 🫣\n\nXem thêm ở M...,False,True,1079.0,1,15
3,Palette tung là anh em vào hứng ngay! 🥳 #GAMTI...,False,True,970.0,0,20
4,"Khởi đầu cho các cuộc nội chiến tại CKTG, GAM ...",False,True,987.0,0,17


In [31]:
from underthesea import text_normalize
from underthesea import classify 

main_df['Tag'] = main_df['text'].apply(lambda x : classify(x)[0])

main_df.head()

Unnamed: 0,text,image,video,reaction_count,weekday,hour,Tag
0,"Palette: Góc đẹp, nhìn em mở nè anh em! 😎 #GAM...",False,True,226.0,1,19,doi_song
1,Cái gì quan trọng thì mình nhắc lại 42 lần! 🙄 ...,False,True,321.0,1,17,chinh_tri_xa_hoi
2,Tâm linh không đùa được đâu! 🫣\n\nXem thêm ở M...,False,True,1079.0,1,15,chinh_tri_xa_hoi
3,Palette tung là anh em vào hứng ngay! 🥳 #GAMTI...,False,True,970.0,0,20,doi_song
4,"Khởi đầu cho các cuộc nội chiến tại CKTG, GAM ...",False,True,987.0,0,17,the_thao


In [32]:
main_df.drop("text", axis= 1, inplace= True)

### Phân chia dữ liệu

In [33]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(main_df, test_size= 0.1, random_state=42)

In [34]:
len(train_df)

314

In [35]:
len(test_df)

35

## Mã hóa dữ liệu text

In [36]:
from sklearn.preprocessing import OneHotEncoder 

tag_encoder = OneHotEncoder()
tag_df = train_df[['Tag']]
tag_encoder_1hot = tag_encoder.fit_transform(tag_df)
tag_encoder_1hot.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [37]:
train_df[['image', 'video', 'reaction_count', 'weekday', 'hour']]

Unnamed: 0,image,video,reaction_count,weekday,hour
55,True,False,1683.0,4,9
42,True,False,2697.0,6,18
30,True,False,1734.0,2,19
9,True,False,446.0,5,18
110,True,False,2697.0,6,18
...,...,...,...,...,...
188,False,True,1333.0,1,10
71,True,False,2106.0,0,20
106,False,True,8219.0,2,13
71,False,True,3513.0,0,20


In [38]:
train_df['reaction_count'] = train_df['reaction_count'].astype('int32')

In [39]:
train_df_label = train_df['reaction_count'].copy()
train_df = train_df.drop("reaction_count", axis = 1)

### Transfrom pipeline

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


main_att = ['image', 'video', 'weekday', 'hour']
tag_att = ['Tag']


pipe_line = Pipeline([
    ('preprocessor', ColumnTransformer(transformers=[
        ('num', StandardScaler(), main_att),
        ('encode', OneHotEncoder(), tag_att)
    ]))
])


train_df_prepare = pipe_line.fit_transform(train_df)


In [41]:
train_df_prepare

array([[ 0.64375027, -0.55036766,  0.36401379, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.64375027, -0.55036766,  1.33266067, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.64375027, -0.55036766, -0.60463308, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.5533974 ,  1.81696722, -0.60463308, ...,  1.        ,
         0.        ,  0.        ],
       [-1.5533974 ,  1.81696722, -1.57327996, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.64375027, -0.55036766, -0.60463308, ...,  0.        ,
         0.        ,  1.        ]])

## Training Model

### LinearRegression

In [42]:
from sklearn.linear_model import LinearRegression 

li_re = LinearRegression()
li_re.fit(train_df_prepare, train_df_label)

In [43]:
from sklearn.metrics import mean_squared_error

train_predict = li_re.predict(train_df_prepare)
li_mse = mean_squared_error(train_predict, train_df_label)
li_rmse = np.sqrt(li_mse)
li_rmse

2926.6234123644886

###  Decision Tree

In [44]:
from sklearn.tree import DecisionTreeRegressor

de_re = DecisionTreeRegressor()
de_re.fit(train_df_prepare, train_df_label)

train_predict_de = de_re.predict(train_df_prepare)
li_rmse_de = np.sqrt(mean_squared_error(train_predict_de, train_df_label))

li_rmse_de

1066.9152347801323

In [45]:
from sklearn.model_selection import cross_val_score

de_score = cross_val_score(de_re, train_df_prepare, train_df_label, scoring="neg_mean_squared_error", cv=10)
li_score = cross_val_score(li_re, train_df_prepare, train_df_label, scoring="neg_mean_squared_error", cv=10)

def print_score(score) : 
    print("Mean : " + str(np.sqrt(-score).mean()) )
    print("STD: " + str(np.sqrt(-score).std()))

print_score(de_score)

print_score(li_score)

Mean : 3610.9724499442646
STD: 1191.7300150040035
Mean : 2982.639738197192
STD: 703.8612312439301


## Predict

In [46]:
test_x_df = test_df.drop("reaction_count", axis = 1)
test_y_df = test_df['reaction_count'].copy()

test_x_prepare = pipe_line.transform(test_x_df)

test_predict = de_re.predict(test_x_prepare)

test_rmse = np.sqrt(mean_squared_error(test_y_df, test_predict))

test_rmse

3335.865593297731