# Import Data

In [13]:
import pandas as pd
import numpy as np

In [14]:
#keyword embedding
import io
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # warnings for tf
import re
import shutil
import string

#pip install tensorflow
import tensorflow as tf

from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [15]:
#import right-wing news dataset
brainded_right = pd.read_csv('/home/zoetustain/code/zulu-tango/news_and_echo_bubbles/raw_data/braindedright.csv')
#convert date+time column into separate columns
brainded_right[['pdate','time']] = brainded_right['pdate'].str.split(' ', n=1, expand=True)
brainded_right['pdate'] = pd.to_datetime(brainded_right['pdate'])

# we only want relatively recent news
mask = brainded_right['pdate'] > '2020-01-01'
brainded_right = brainded_right[mask].reset_index()

# import left-wing news dataset
brainded_left = pd.read_csv('/home/zoetustain/code/zulu-tango/news_and_echo_bubbles/raw_data/braindedleft.csv')
brainded_left[['pdate','time']] = brainded_left['pdate'].str.split(' ', n=1, expand=True)
brainded_left['pdate'] = pd.to_datetime(brainded_left['pdate'])

mask = brainded_left['pdate'] > '2020-01-01'
brainded_left = brainded_left[mask].reset_index()

# drop empty rows
brainded_right = brainded_right.dropna().reset_index()
brainded_left = brainded_left.dropna().reset_index()

In [16]:
brainded_left.drop(columns={'level_0','index','Unnamed: 0'},inplace=True)
brainded_right.drop(columns={'level_0','index','Unnamed: 0'},inplace=True)

# Prepare data

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
#add classification column to data
brainded_left['classifier'] = 0
brainded_right['classifier'] = 1

In [19]:
data_left = brainded_left[['text','classifier']]
data_right = brainded_right[['text','classifier']]
data_combined = data_left.merge(data_right,how='outer')

In [20]:
X = data_combined[['text']]
y = data_combined.classifier

# Text Vectorisation

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# needs to be on processed data - the vectorisation below has numbers in it

#run grid searhc on tokeniser

In [12]:
# Training it on the texts
vectorised_words = pd.DataFrame(tf_idf_vectorizer.fit_transform(X.text).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())

vectorised_words

Unnamed: 0,00,000,01,02,0274,03,035,04,05,06,...,zoonotic,zte,zuckerberg,zuma,às,áñez,état,être,órgão,única
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.071799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.015113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.015113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2876,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2877,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2878,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2879,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

# Gradient Boosting Classifier

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

In [15]:
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,max_depth=2,random_state=0)

In [16]:
clf.fit(X_train,y_train)

In [17]:
clf.score(X_test, y_test)

0.8405545927209706

# Random Forest Classification

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [19]:
X, y = make_classification(n_samples=1000, n_features=24000,
                            n_informative=2, n_redundant=0,
                            random_state=0, shuffle=False)

In [20]:
clf = RandomForestClassifier(max_depth=2, random_state=0)

In [21]:
clf.fit(X_train, y_train)

In [22]:
clf.score(X_test,y_test)

0.5545927209705372

# Optimise classification parameters

In [3]:
#grid search on gradient boosting

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [55]:
pipeline = Pipeline([
           ('tfidf', TfidfVectorizer()),
           ('clf', GradientBoostingClassifier()),
])

In [56]:
pipeline

In [57]:
pipeline.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()),
  ('clf', GradientBoostingClassifier())],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'clf': GradientBoostingClassifier(),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'clf__ccp_alpha': 0.0,
 'clf__criterion': 'friedman_mse',
 'clf__init': None,
 'clf__learning_rate': 0.1,
 'clf__loss': 'log_loss',
 'clf__max_depth': 3,
 'clf__max_features': None,
 'clf__max_leaf_nodes': Non

In [63]:
parameters = {
        'tfidf__min_df': (1, 0.001, 0.01),
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
        'clf__learning_rate': (1,0.1,0.01),
         'clf__loss': ('log_loss','exponential'),
     'clf__min_samples_leaf': (1,5,10),
     'clf__n_estimators': (100,1000),
    }

In [64]:
# grid search
gs = GridSearchCV(pipeline, parameters, cv=3, n_jobs=8, verbose=10)
gs.fit(X_train, y_train)

Fitting 3 folds for each of 540 candidates, totalling 1620 fits
[CV 1/3; 2108/2160] END clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=10, clf__n_estimators=1000, tfidf__max_df=0.125, tfidf__min_df=0.001, tfidf__ngram_range=(1, 3);, score=nan total time=   0.0s
[CV 3/3; 2110/2160] START clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=10, clf__n_estimators=1000, tfidf__max_df=0.125, tfidf__min_df=0.001, tfidf__ngram_range=(1, 5)
[CV 3/3; 2110/2160] END clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=10, clf__n_estimators=1000, tfidf__max_df=0.125, tfidf__min_df=0.001, tfidf__ngram_range=(1, 5);, score=nan total time=   0.0s
[CV 2/3; 2113/2160] START clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=10, clf__n_estimators=1000, tfidf__max_df=0.125, tfidf__min_df=0.01, tfidf__ngram_range=(1, 3)
[CV 2/3; 2113/2160] END clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=10, clf__n_estima

ValueError: 
All the 1620 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1620 fits failed with the following error:
Traceback (most recent call last):
  File "/home/zoetustain/.pyenv/versions/3.10.6/envs/echo_news_bubbles/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/zoetustain/.pyenv/versions/3.10.6/envs/echo_news_bubbles/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/zoetustain/.pyenv/versions/3.10.6/envs/echo_news_bubbles/lib/python3.10/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/zoetustain/.pyenv/versions/3.10.6/envs/echo_news_bubbles/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/zoetustain/.pyenv/versions/3.10.6/envs/echo_news_bubbles/lib/python3.10/site-packages/sklearn/ensemble/_gb.py", line 416, in fit
    X, y = self._validate_data(
  File "/home/zoetustain/.pyenv/versions/3.10.6/envs/echo_news_bubbles/lib/python3.10/site-packages/sklearn/base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/zoetustain/.pyenv/versions/3.10.6/envs/echo_news_bubbles/lib/python3.10/site-packages/sklearn/utils/validation.py", line 1164, in check_X_y
    check_consistent_length(X, y)
  File "/home/zoetustain/.pyenv/versions/3.10.6/envs/echo_news_bubbles/lib/python3.10/site-packages/sklearn/utils/validation.py", line 407, in check_consistent_length
    raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [1, 1536]


In [65]:
# best estimator to fit
gs.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

[CV 1/3; 500/540] START clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=5, clf__n_estimators=1000, tfidf__min_df=1, tfidf__ngram_range=(1, 5)
[CV 1/3; 500/540] END clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=5, clf__n_estimators=1000, tfidf__min_df=1, tfidf__ngram_range=(1, 5);, score=nan total time=   0.0s
[CV 1/3; 505/540] START clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=5, clf__n_estimators=1000, tfidf__min_df=0.001, tfidf__ngram_range=(1, 5)
[CV 1/3; 505/540] END clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=5, clf__n_estimators=1000, tfidf__min_df=0.001, tfidf__ngram_range=(1, 5);, score=nan total time=   0.0s
[CV 2/3; 505/540] START clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=5, clf__n_estimators=1000, tfidf__min_df=0.001, tfidf__ngram_range=(1, 5)
[CV 2/3; 505/540] END clf__learning_rate=0.01, clf__loss=exponential, clf__min_samples_leaf=5, clf__n_estimato

In [None]:
best_clf = gs.best_estimator_
y_test = best_clf.predict(X_test)

In [None]:
# simple filename parser and output results
p = np.vectorize(lambda x: x.split('.')[0].split('/')[2])
util.write_predictions(Y_test, p(test.filenames).tolist(), 'tfidfvec_gs.csv')