# Setup

In [None]:
import os

# Set the XLA_FLAGS environment variable for this process
# (the flag value requests a host platform device count of 8).
os.environ.update(XLA_FLAGS="--xla_force_host_platform_device_count=8")

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime at the path below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Dataset

In [None]:
# Columns this notebook works with: the preprocessed text, the label, sentiment scores,
# text statistics, post engagement counts and the timestamp parts.
cols = ['preprocess', 'label',
        'NEG', 'POS', 'NEU',
        'num_word', 'num_char', 'num_hashtag', 'num_url',
        'num_like_post', 'num_comment_post', 'num_share_post',
        'hour', 'weekday', 'day', 'month', 'year']

# Pass the column list to the reader so only the expected columns are loaded;
# read_csv raises if any of them is missing from the file, and a stray saved
# index column is ignored.
df = pd.read_csv('Data/data_after_preprocessing_text_sa_timestamp.csv', usecols=cols)

In [None]:
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4106 entries, 0 to 4105
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   preprocess        4106 non-null   object 
 1   label             4106 non-null   int64  
 2   NEG               4106 non-null   float64
 3   POS               4106 non-null   float64
 4   NEU               4106 non-null   float64
 5   num_word          4106 non-null   int64  
 6   num_char          4106 non-null   int64  
 7   num_hashtag       4106 non-null   int64  
 8   num_url           4106 non-null   int64  
 9   num_like_post     3998 non-null   object 
 10  num_comment_post  4094 non-null   object 
 11  num_share_post    3339 non-null   object 
 12  hour              4106 non-null   int64  
 13  weekday           4106 non-null   int64  
 14  day               4106 non-null   int64  
 15  month             4106 non-null   int64  
 16  year              4106 non-null   int64  


Unnamed: 0,preprocess,label,NEG,POS,NEU,num_word,num_char,num_hashtag,num_url,num_like_post,num_comment_post,num_share_post,hour,weekday,day,month,year
0,cần các bậc phụ_huynh xã ngũ thái lên_tiếng kh...,1,0.985327,0.004864,0.009808,34,182,0,0,45,15,8,6,1,17,3,2020
1,kêu_gọi ăn_chay cầu_nguyện xin chúa cứu khỏi d...,1,0.659123,0.213645,0.127232,10,62,0,0,979,39,138,19,0,10,2,2020
2,giàn khoan dầu_khí gặp sự_cố chết người giàn k...,1,0.929886,0.016793,0.05332,41,212,0,0,85,13,61,12,3,16,4,2020
3,thuận_lợi có ca dương_tính cv19 rồi đó mọi ngư...,1,0.004563,0.982044,0.013393,28,151,0,0,114,12,5,10,0,3,8,2020
4,sa_pa cho 9 người khách nước_ngoài đi cùng chu...,1,0.038136,0.090191,0.871673,13,65,0,0,166,4,21,2,5,7,3,2020


In [None]:
# Rows whose share count is stored as the literal string 'unknown'.
df.loc[df['num_share_post'].eq('unknown')]

Unnamed: 0,preprocess,label,NEG,POS,NEU,num_word,num_char,num_hashtag,num_url,num_like_post,num_comment_post,num_share_post,hour,weekday,day,month,year
5,mọi người ra đường nhớ đeo khẩu_trang nhã đã c...,1,0.005398,0.963102,0.031501,17,80,0,0,unknown,unknown,unknown,5,6,9,8,2020
11,danh_hài thuý nga qua_đời để lại gia_tài triệu...,1,0.007171,0.946044,0.046785,5,56,0,0,unknown,unknown,unknown,2,1,30,6,2020
12,quanh khu_vực xuân mai mọi người cẩn_trọng chú...,1,0.965107,0.016469,0.018424,83,457,0,0,unknown,unknown,unknown,13,2,20,5,2020
14,ban muốn mình là người sống_sót trống con_số 9...,1,0.066749,0.834880,0.098371,195,897,0,0,unknown,unknown,unknown,8,0,24,2,2020
18,vụ bắt_cóc con_nít thôn 5 nông sơn sao không t...,1,0.202187,0.063569,0.734244,14,58,0,0,unknown,unknown,unknown,0,3,16,4,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3978,mọi người ơi hạn_chế za đường nhé vì 3nguoi nh...,1,0.970915,0.007577,0.021508,68,348,0,0,unknown,unknown,unknown,3,3,6,2,2020
4003,thông_báo bệnh_viện thái_hoà hồng_ngự mới tiếp...,1,0.179704,0.087048,0.733249,51,266,0,0,unknown,unknown,unknown,8,0,9,3,2020
4066,tin nóng đã tìm ra nguyên nhiên gây tử_vong ch...,1,0.468428,0.039129,0.492443,236,1227,0,0,unknown,unknown,unknown,1,6,18,8,2019
4088,xong ning giang toang rồi ninh_giang dính covi...,1,0.051228,0.080587,0.868185,12,73,0,0,unknown,unknown,unknown,15,6,26,7,2020


In [None]:
# Rows whose share count is stored as the text '1 share' rather than a bare number.
df.loc[df['num_share_post'].eq('1 share')]

Unnamed: 0,preprocess,label,NEG,POS,NEU,num_word,num_char,num_hashtag,num_url,num_like_post,num_comment_post,num_share_post,hour,weekday,day,month,year
6,tin chấn_động nhân_loại được cứu 100 người thử...,1,0.003936,0.972723,0.023342,62,328,0,0,21,5,1 share,15,4,20,3,2020
54,người đàn_ông hồi_giáo tươi_cười khi nhà_thờ đ...,1,0.027069,0.181464,0.791467,17,100,0,0,82,13,1 share,15,2,15,4,2020
58,ngay cả khí_vị rút xâm_nhập vào miệng bạn nước...,1,0.066796,0.518836,0.414368,37,197,0,0,17,2,1 share,16,5,29,2,2020
168,bcao cô và các bố_mẹ e vừa dự cuộc họp khẩn củ...,1,0.212937,0.151256,0.635807,201,974,0,0,55,4,1 share,8,2,18,3,2020
172,tối nay từ 11 40 tối không ai nên đi ra đường ...,1,0.948462,0.028394,0.023144,42,235,0,0,70,5,1 share,5,6,29,3,2020
175,bài_thuốc mỗi sáng khiến tôi khoẻ_mạnh khuyến_...,1,0.005189,0.983623,0.011189,81,458,0,0,10,5,1 share,11,2,29,4,2020
889,mọi người cảnh_giác nhé trưa hôm_nay cô_mình m...,1,0.980942,0.005269,0.01379,46,212,0,0,91,33,1 share,14,6,10,3,2019
1332,thuốc chữa sốt_rét chữa covid ở mỹ trung hay b...,1,0.028625,0.550837,0.420538,38,188,0,0,55,24,1 share,19,0,23,3,2020
1483,nhóm du_khách hàn_quốc sau khi từ_chối cách_ly...,1,0.984078,0.003857,0.012064,30,177,0,0,32,22,1 share,15,2,26,2,2020
2178,bcao cô và các bố_mẹ e vừa dự cuộc họp khẩn củ...,1,0.212937,0.151256,0.635807,201,974,0,0,55,4,1 share,8,2,18,3,2020


In [None]:
# Rows whose comment count is stored as the text '1 comment' rather than a bare number.
df.loc[df['num_comment_post'].eq('1 comment')]

Unnamed: 0,preprocess,label,NEG,POS,NEU,num_word,num_char,num_hashtag,num_url,num_like_post,num_comment_post,num_share_post,hour,weekday,day,month,year
36,phó chủ_tịch thường_trực ubnd thành_phố hồ_chí...,1,0.985244,0.004743,0.010013,60,305,0,0,16,1 comment,0,9,0,30,3,2020
48,lô thuốc đầu_tiên chữa khỏi bệnh viêm phổi vir...,1,0.006129,0.906991,0.08688,17,88,0,0,12,1 comment,0,14,2,18,3,2020
151,tỷ_phú bill gates là người tung virus covid qu...,1,0.052209,0.056453,0.891337,13,83,0,0,21,1 comment,0,6,2,5,2,2020
725,à nội đã hoãn giải đua f1 do dịch covid,0,0.092729,0.070392,0.83688,9,43,0,0,29,1 comment,0,15,5,7,3,2020
812,phó chủ_tịch thường_trực ubnd thành_phố hồ_chí...,1,0.985244,0.004743,0.010013,60,305,0,0,16,1 comment,0,9,0,30,3,2020
2427,nghi_ngờ virus 2019 covid là vũ_khí_sinh_học b...,1,0.944405,0.011982,0.043612,55,368,0,0,20,1 comment,6,6,2,5,2,2020
3182,lô thuốc đầu_tiên chữa khỏi bệnh viêm phổi vir...,1,0.006129,0.906991,0.08688,17,88,0,0,12,1 comment,0,14,2,18,3,2020
3658,nếu ai định đi du_lịch đặt vé máy_bay thì hay ...,1,0.030938,0.105952,0.86311,52,250,0,0,10,1 comment,0,5,2,29,7,2020


In [None]:
# Check whether any like count is stored as the text '1 like'.
df.loc[df['num_like_post'].eq('1 like')]

Unnamed: 0,preprocess,label,NEG,POS,NEU,num_word,num_char,num_hashtag,num_url,num_like_post,num_comment_post,num_share_post,hour,weekday,day,month,year


In [None]:
# Replace the singular count strings found above with the plain digit string '1'.
for engagement_col, singular_text in (
    ('num_comment_post', '1 comment'),
    ('num_share_post', '1 share'),
):
    df[engagement_col] = df[engagement_col].replace(singular_text, '1')

In [None]:
# Cast the comment counts to strings, then blank out (None) every entry that contains
# an ASCII letter; entries without letters are kept as they are.
comment_as_text = df['num_comment_post'].astype(str)
df['num_comment_post'] = comment_as_text
comment_has_letters = comment_as_text.str.contains(r'[a-zA-Z]')
df['num_comment_post'] = comment_as_text.where(~comment_has_letters, other=None)

In [None]:
# Cast the like counts to strings, then blank out (None) every entry that contains
# an ASCII letter; entries without letters are kept as they are.
like_as_text = df['num_like_post'].astype(str)
df['num_like_post'] = like_as_text
like_has_letters = like_as_text.str.contains(r'[a-zA-Z]')
df['num_like_post'] = like_as_text.where(~like_has_letters, other=None)

In [None]:
# Cast the share counts to strings, then blank out (None) every entry that contains
# an ASCII letter; entries without letters are kept as they are.
share_as_text = df['num_share_post'].astype(str)
df['num_share_post'] = share_as_text
share_has_letters = share_as_text.str.contains(r'[a-zA-Z]')
df['num_share_post'] = share_as_text.where(~share_has_letters, other=None)

In [None]:
# Convert the three engagement-count columns to numbers. errors='coerce' turns anything
# that cannot be parsed into NaN.
# The columns are deliberately left as floats: the masked entries become NaN here, and
# casting a column containing NaN to np.int64 raises, so no integer cast is applied.
# (The previous third line also used `np.64`, which is not valid Python.)
for count_col in ('num_share_post', 'num_like_post', 'num_comment_post'):
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce')

In [None]:
# Mean share count over the non-missing rows, rounded to the nearest whole number.
mean_share = round(df['num_share_post'].mean())
print(mean_share)

475.3860955927995


In [None]:
# Mean comment count over the non-missing rows, rounded to the nearest whole number.
mean_comment = round(df['num_comment_post'].mean())
print(mean_comment)

123.70842767295598


In [None]:
# Mean like count over the non-missing rows, rounded to the nearest whole number.
mean_like = round(df['num_like_post'].mean())
print(mean_like)

1885.0152101056974


In [None]:
# Fill the missing engagement counts with the rounded column means computed above.
for count_col, fill_value in (
    ('num_share_post', mean_share),
    ('num_comment_post', mean_comment),
    ('num_like_post', mean_like),
):
    df[count_col] = df[count_col].fillna(fill_value)

In [None]:
# DataFrame.info() prints its own report and returns None; calling it directly avoids
# the extra "None" line that print() would emit.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4106 entries, 0 to 4105
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   preprocess        4106 non-null   object 
 1   label             4106 non-null   int64  
 2   NEG               4106 non-null   float64
 3   POS               4106 non-null   float64
 4   NEU               4106 non-null   float64
 5   num_word          4106 non-null   int64  
 6   num_char          4106 non-null   int64  
 7   num_hashtag       4106 non-null   int64  
 8   num_url           4106 non-null   int64  
 9   num_like_post     4106 non-null   float64
 10  num_comment_post  4106 non-null   float64
 11  num_share_post    4106 non-null   float64
 12  hour              4106 non-null   int64  
 13  weekday           4106 non-null   int64  
 14  day               4106 non-null   int64  
 15  month             4106 non-null   int64  
 16  year              4106 non-null   int64  


In [None]:
# Preview the first five rows after the numeric conversion and imputation.
df.head(n=5)

Unnamed: 0,preprocess,label,NEG,POS,NEU,num_word,num_char,num_hashtag,num_url,num_like_post,num_comment_post,num_share_post,hour,weekday,day,month,year
0,cần các bậc phụ_huynh xã ngũ thái lên_tiếng kh...,1,0.985327,0.004864,0.009808,34,182,0,0,45.0,15.0,8.0,6,1,17,3,2020
1,kêu_gọi ăn_chay cầu_nguyện xin chúa cứu khỏi d...,1,0.659123,0.213645,0.127232,10,62,0,0,979.0,39.0,138.0,19,0,10,2,2020
2,giàn khoan dầu_khí gặp sự_cố chết người giàn k...,1,0.929886,0.016793,0.05332,41,212,0,0,85.0,13.0,61.0,12,3,16,4,2020
3,thuận_lợi có ca dương_tính cv19 rồi đó mọi ngư...,1,0.004563,0.982044,0.013393,28,151,0,0,114.0,12.0,5.0,10,0,3,8,2020
4,sa_pa cho 9 người khách nước_ngoài đi cùng chu...,1,0.038136,0.090191,0.871673,13,65,0,0,166.0,4.0,21.0,2,5,7,3,2020


In [None]:
# Save the cleaned frame. index=False keeps the row index out of the file, so a plain
# read_csv of the result does not pick up an extra unnamed index column.
df.to_csv('Data/data_after_preprocessing_text_sa_timestamp_numeric.csv', index=False)

In [None]:
# Number of rows per label value, shown as a one-column frame.
label_counts = df['label'].value_counts()
label_counts.to_frame()

Unnamed: 0,label
0,3348
1,758
