# Import Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#keyword embedding
import io
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # warnings for tf
import re
import shutil
import string

#pip install tensorflow
import tensorflow as tf

from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

2023-11-28 11:11:38.310548: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-28 11:11:38.310615: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-28 11:11:38.311931: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
#import right-wing news dataset
brainded_right = pd.read_csv('/home/zoetustain/code/zulu-tango/news_and_echo_bubbles/raw_data/braindedright.csv')
#convert date+time column into separate columns
brainded_right[['pdate','time']] = brainded_right['pdate'].str.split(' ', n=1, expand=True)
brainded_right['pdate'] = pd.to_datetime(brainded_right['pdate'])

# we only want relatively recent news
mask = brainded_right['pdate'] > '2020-01-01'
brainded_right = brainded_right[mask].reset_index()

# import left-wing news dataset
brainded_left = pd.read_csv('/home/zoetustain/code/zulu-tango/news_and_echo_bubbles/raw_data/braindedleft.csv')
brainded_left[['pdate','time']] = brainded_left['pdate'].str.split(' ', n=1, expand=True)
brainded_left['pdate'] = pd.to_datetime(brainded_left['pdate'])

mask = brainded_left['pdate'] > '2020-01-01'
brainded_left = brainded_left[mask].reset_index()

# drop empty rows
brainded_right = brainded_right.dropna().reset_index()
brainded_left = brainded_left.dropna().reset_index()

In [4]:
brainded_left.drop(columns={'level_0','index','Unnamed: 0'},inplace=True)
brainded_right.drop(columns={'level_0','index','Unnamed: 0'},inplace=True)

# Prepare data

In [5]:
from sklearn.model_selection import train_test_split

In [7]:
#add classification column to data
brainded_left['classifier'] = 0
brainded_right['classifier'] = 1

In [8]:
data_left = brainded_left[['text','classifier']]
data_right = brainded_right[['text','classifier']]
data_combined = data_left.merge(data_right,how='outer')

In [9]:
X = data_combined[['text']]
y = data_combined.classifier

# Text Vectorisation

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# needs to be on processed data - the vectorisation below has numbers in it

In [12]:
# Training it on the texts
vectorised_words = pd.DataFrame(tf_idf_vectorizer.fit_transform(X.text).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())

vectorised_words

Unnamed: 0,00,000,01,02,0274,03,035,04,05,06,...,zoonotic,zte,zuckerberg,zuma,às,áñez,état,être,órgão,única
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.071799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.015113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.015113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2876,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2877,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2878,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2879,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(vectorised_words,y,test_size=0.2)

# Gradient Boosting Classifier

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

In [15]:
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,max_depth=2,random_state=0)

In [16]:
clf.fit(X_train,y_train)

In [17]:
clf.score(X_test, y_test)

0.8405545927209706

# Random Forest Classification

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [19]:
X, y = make_classification(n_samples=1000, n_features=24000,
                            n_informative=2, n_redundant=0,
                            random_state=0, shuffle=False)

In [20]:
clf = RandomForestClassifier(max_depth=2, random_state=0)

In [21]:
clf.fit(X_train, y_train)

In [22]:
clf.score(X_test,y_test)

0.5545927209705372