In [4]:
import os
import string
from datetime import datetime
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline

import nltk
from nltk.tokenize import word_tokenize

from gensim.models import Word2Vec

from tensorflow.config.experimental import list_physical_devices, set_memory_growth
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Dropout, SpatialDropout1D, SimpleRNN
from tensorflow.keras.regularizers import l2

import optuna

from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

from cybnews.data import get_data, welf_join_text

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

print("Num GPUs Available: ", len(list_physical_devices('GPU')))

gpus = list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

Num GPUs Available:  1


In [17]:
df = pd.read_csv('/home/tober/devel/lewagon/project/cyb-news/data/unified_data.csv', index_col=0)
df.head()

Unnamed: 0,all_text_cleaned,label
0,culture war history conversation andrew hartma...,1
1,blood collection chair market growth top manuf...,1
2,terrorist attack archive ⋆ dc gazette terroris...,1
3,christian life coach star tim storey talk past...,0
4,israel takeover internet israel takeover inter...,1


In [15]:
#data = df.sample(frac=0.5, random_state=42)

In [25]:
# {'colsample_bytree': 0.6, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 300, 'subsample': 0.7}
# Text preprocessing and feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['all_text_cleaned'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['label'],
    test_size=0.2,
    random_state=42
)

model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    colsample_bytree=0.6,
    learning_rate=0.2,
    max_depth=6,
    n_estimators=300,
    subsample=0.7,
    n_jobs=28
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9561749128019635
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96     15213
           1       0.97      0.94      0.96     15751

    accuracy                           0.96     30964
   macro avg       0.96      0.96      0.96     30964
weighted avg       0.96      0.96      0.96     30964



In [20]:
model.save_model('xgboost')

In [24]:
loaded_model = XGBClassifier()
loaded_model.load_model('xgboost')
y_pred = loaded_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9561749128019635
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96     15213
           1       0.97      0.94      0.96     15751

    accuracy                           0.96     30964
   macro avg       0.96      0.96      0.96     30964
weighted avg       0.96      0.96      0.96     30964



In [40]:
# Text preprocessing and feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['all_text_cleaned'])

X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)

#model = XGBClassifier(tree_method='gpu_hist', use_label_encoder=False, eval_metric='logloss')
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')


# param_grid = {
#     'n_estimators': [300, 320, 340],#, 100, 150, 200, 250],
#     'max_depth': [5, 6, 7],
#     'learning_rate': [0.1, 0.2],
#     # 'subsample': [0.6, 0.7, 0.8, 0.9],
#     # 'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
# }

# {'colsample_bytree': 0.6, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 300, 'subsample': 0.7}

param_grid = {
    'n_estimators': [260, 280, 300, 320, 340],
    'max_depth': [5, 6, 7],
    'learning_rate': [0.15, 0.2, 0.25],
    'subsample': [0.4, 0.5, 0.6, 0.7],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid, 
    scoring='accuracy',
    cv=3,
    verbose=3,
    n_jobs=28,
    error_score="raise",
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 3 folds for each of 720 candidates, totalling 2160 fits
[CV 1/3] END colsample_bytree=0.4, learning_rate=0.15, max_depth=5, n_estimators=300, subsample=0.5;, score=0.920 total time=  33.6s
[CV 2/3] END colsample_bytree=0.4, learning_rate=0.15, max_depth=5, n_estimators=280, subsample=0.4;, score=0.921 total time=  34.7s
[CV 3/3] END colsample_bytree=0.4, learning_rate=0.15, max_depth=5, n_estimators=260, subsample=0.4;, score=0.922 total time=  36.2s
[CV 1/3] END colsample_bytree=0.4, learning_rate=0.15, max_depth=5, n_estimators=260, subsample=0.4;, score=0.918 total time=  36.4s
[CV 2/3] END colsample_bytree=0.4, learning_rate=0.15, max_depth=5, n_estimators=260, subsample=0.4;, score=0.920 total time=  36.8s
[CV 3/3] END colsample_bytree=0.4, learning_rate=0.15, max_depth=5, n_estimators=280, subsample=0.4;, score=0.922 total time=  36.8s
[CV 1/3] END colsample_bytree=0.4, learning_rate=0.15, max_depth=5, n_estimators=280, subsample=0.5;, score=0.921 total time=  37.7s
[CV 1

In [43]:
results_df = pd.DataFrame(grid_search.cv_results_)

# Display the DataFrame
#print(results_df)
results_df.head()
#results_df.to_csv('xgboost_reults.csv')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,36.382529,0.249636,0.071412,0.000406,0.4,0.15,5,260,0.4,"{'colsample_bytree': 0.4, 'learning_rate': 0.1...",0.917656,0.9203,0.921996,0.919984,0.001786,592
1,41.124581,0.787537,0.070274,0.000736,0.4,0.15,5,260,0.5,"{'colsample_bytree': 0.4, 'learning_rate': 0.1...",0.919351,0.919816,0.92345,0.920872,0.001832,541
2,43.772101,1.366132,0.063285,0.007612,0.4,0.15,5,260,0.6,"{'colsample_bytree': 0.4, 'learning_rate': 0.1...",0.922015,0.921996,0.928052,0.924021,0.00285,243
3,46.101681,2.79888,0.069681,0.001688,0.4,0.15,5,260,0.7,"{'colsample_bytree': 0.4, 'learning_rate': 0.1...",0.920804,0.919574,0.927326,0.922568,0.003402,390
4,36.925366,1.981557,0.071503,0.009834,0.4,0.15,5,280,0.4,"{'colsample_bytree': 0.4, 'learning_rate': 0.1...",0.919109,0.921269,0.922481,0.920953,0.001395,534


In [None]:
# GPU
#
# n_estimators: [20, 50]
# n_jobs = 3
# -> 4:23
#
# n_estimators: [20, 50]
# n_jobs = 12
# -> 



# CPU
# n_estimators: [20, 50]
# n_jobs = 26
# -> 0:19

In [3]:
import pandas as pd
df = pd.read_csv('xgboost_reults.csv')
df.head()
df.sort_values(by=['rank_test_score'], inplace=True)
df.head()
df.iloc[0]['params']

"{'colsample_bytree': 0.6, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 300, 'subsample': 0.7}"