In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from random import shuffle
import pickle as pkl
import glob
import ujson
from sklearn import svm
import pandas as pd
from sklearn.metrics import classification_report

with open('../data/stopwords.txt', 'r') as f:
    STOP_WORDS = f.read().splitlines()

TOPIC_COLUMN = "content"
DATABASE_PATH = '../scrapping/demagog/dataset.csv'

df = pd.read_csv(DATABASE_PATH)
df = df.loc[df['content'].notna()]
df = df.loc[df['label'].isin(
    ['Prawda', 'Fałsz', 'Częściowy fałsz', 'Manipulacja'])]
df['category'] = df['label'].apply(lambda x: 1 if x == 'Prawda' else 0)
num = 20000
min_df = 2
max_df = 0.1
v = TfidfVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='replace',
    # strip_accents='unicode',
    lowercase=True,
    analyzer='word',
    stop_words=STOP_WORDS,
    # token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
    ngram_range=(1, 2),
    max_features=num,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_df=max_df,
    min_df=min_df)

X = v.fit_transform(df[TOPIC_COLUMN])
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    df['category'],
                                                    test_size=0.2,
                                                    random_state=42)

In [9]:
import sklearn.gaussian_process as gp
from sklearn.linear_model import LogisticRegression
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

model = BayesSearchCV(
    svm.SVC(kernel='linear', probability=True),
    {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
        # 'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        # 'degree': Integer(1,8),
        # 'kernel': Categorical(['linear', 'poly', 'rbf']),
    },
    n_iter=32,
    scoring='f1_macro',
    random_state=42)
# model = LogisticRegression(C=1.0, max_iter=5000)
# model = svm.SVC(kernel='linear', C=1.0, probability=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.52      0.62       118
           1       0.75      0.91      0.82       187

    accuracy                           0.76       305
   macro avg       0.77      0.71      0.72       305
weighted avg       0.76      0.76      0.74       305



In [19]:
test_df = pd.read_csv("../data/database.csv", lineterminator="\n")
test_df = test_df[test_df["summary"].notna()]
X = v.transform(test_df["summary"])

In [21]:
test_df['probability_fake'] = model.predict_proba(X)[:, 0].tolist()
test_df.to_csv("../data/database_with_proba.csv")

In [None]:
df.to_pickle('../data/database_predicted.pkl')

In [None]:
import joblib
# dump
joblib.dump(model, '../models/svm.pkl')
# read
model = joblib.load('../models/svm.pkl')