In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb

# Read the news dataset from the CSV file into a DataFrame
data = pd.read_csv('fake_or_real_news.csv')

# Features come from the 'text' column, targets from the 'label' column
X, y = data['text'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: English stop words are dropped, and terms that occur in
# more than 70% of the documents are ignored (max_df=0.7)
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Fit the vectorizer on the training texts only, then apply the same fitted
# transformation to the test texts
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [6]:
# Train a logistic regression classifier on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict on the held-out test features, then report overall accuracy
# followed by the per-class precision / recall / F1 table
y_pred_lr = lr_model.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)
print("Logistic Regression Accuracy: ", lr_accuracy)
print(lr_report)


Logistic Regression Accuracy:  0.9155485398579322
              precision    recall  f1-score   support

        FAKE       0.90      0.93      0.92       628
        REAL       0.93      0.90      0.91       639

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [7]:
# Train a multinomial naive Bayes classifier on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict on the held-out test features, then report overall accuracy
# followed by the per-class precision / recall / F1 table
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)
print("Naive Bayes Accuracy: ", nb_accuracy)
print(nb_report)


Naive Bayes Accuracy:  0.8453038674033149
              precision    recall  f1-score   support

        FAKE       0.98      0.71      0.82       628
        REAL       0.77      0.98      0.87       639

    accuracy                           0.85      1267
   macro avg       0.87      0.84      0.84      1267
weighted avg       0.87      0.85      0.84      1267



In [13]:
# Encode the string labels as integers (FAKE -> 0, REAL -> 1) for the
# 'binary:logistic' objective. The result is a new Series: `data` itself is
# left untouched, so re-running this cell cannot map already-encoded labels
# to NaN the way an in-place `data['label'] = data['label'].map(...)` would.
label_to_int = {'FAKE': 0, 'REAL': 1}
y = data['label'].map(label_to_int)
if y.isna().any():
    # Fail fast instead of handing NaN labels to XGBoost
    raise ValueError("data['label'] contains values other than 'FAKE'/'REAL'")

# Reuse the train/test split and the TF-IDF matrices built in the data
# preparation cell: repeating train_test_split and TfidfVectorizer with the
# same arguments would only recompute identical values. The encoded labels
# are picked by the index of the existing X_train / X_test, which keeps them
# row-aligned with X_train_tfidf / X_test_tfidf.
y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]
assert X_train_tfidf.shape[0] == len(y_train), "train features/labels are misaligned"
assert X_test_tfidf.shape[0] == len(y_test), "test features/labels are misaligned"

# Wrap the sparse features and labels in XGBoost's DMatrix containers
dtrain = xgb.DMatrix(X_train_tfidf, label=y_train)
dtest = xgb.DMatrix(X_test_tfidf, label=y_test)

# Booster parameters. 'n_estimators' is intentionally absent: xgb.train()
# does not use it (it only produced a "Parameters ... are not used" warning);
# the number of trees is set through num_boost_round instead.
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'learning_rate': 0.1,
}
num_boost_round = 100

# Train the booster
xgb_model = xgb.train(params, dtrain, num_boost_round=num_boost_round)

# predict() returns probabilities for 'binary:logistic'; threshold them at 0.5
# to get 0/1 class predictions, then report accuracy and per-class metrics
y_pred_xgb = (xgb_model.predict(dtest) > 0.5).astype(int)
print("XGBoost Accuracy: ", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


Parameters: { "n_estimators" } are not used.



XGBoost Accuracy:  0.9123914759273876
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       628
           1       0.92      0.91      0.91       639

    accuracy                           0.91      1267
   macro avg       0.91      0.91      0.91      1267
weighted avg       0.91      0.91      0.91      1267

