In [2]:
import pandas as pd
import spacy

In [5]:
df = pd.read_csv('csvs/political_articles2.csv')

In [6]:
df.head()
# although the 'text' column is short, we will try running basic ML

Unnamed: 0,text,label
0,Pfizer says its Covid-19 vaccine is safe and 1...,Left Data
1,The NBC-Wall Street Journal's latest national ...,Left Data
2,"September 19th, 2012 06:58 AM ET...",Left Data
3,Story highlightsCNN/ORC International poll: Si...,Left Data
4,"Washington (CNN)Dr. Anthony Fauci, the nation'...",Left Data


In [7]:
df['label'] = df['label'].apply(lambda x: 'left' if x == 'Left Data' else ('right' if x == 'Right Data' else 'center'))
df

Unnamed: 0,text,label
0,Pfizer says its Covid-19 vaccine is safe and 1...,left
1,The NBC-Wall Street Journal's latest national ...,left
2,"September 19th, 2012 06:58 AM ET...",left
3,Story highlightsCNN/ORC International poll: Si...,left
4,"Washington (CNN)Dr. Anthony Fauci, the nation'...",left
...,...,...
17357,"By Daniel De SimoneBBC NewsImage source, COUNT...",center
17358,...,center
17359,...,center
17360,President BidenJoe BidenUS could spend M month...,center


In [11]:
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 1500000
from tqdm import tqdm

In [12]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ''  # or return None or some other placeholder
    doc = nlp(text)
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(processed_tokens)  # Join back into a string (if desired)

tqdm.pandas()  # Enable the progress bar for pandas
df['text_proc'] = df['text'].progress_apply(preprocess_text)


100%|██████████████████████████████████████████████████████████████████████████| 17362/17362 [1:00:26<00:00,  4.79it/s]


In [19]:
# Now we have the preprocessed data, we can start using a model
df.to_csv('preprocessed_text.csv', index=False)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df['text_proc'], df['label'], test_size=0.3)

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [27]:
# start with a linear regression model, automatically uses softmax because of multi-class classification
model = LogisticRegression(max_iter=1000, verbose=1)

# just for progress bar purposes
for _ in tqdm(range(1)):
    model.fit(X_train, y_train)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.20s/it]


In [28]:
# try random forest
model2 = RandomForestClassifier(n_estimators=100)

for _ in tqdm(range(1)):
    model2.fit(X_train, y_train)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:56<00:00, 56.49s/it]


In [29]:
pred = model.predict(X_test)
pred2 = model2.predict(X_test)

In [30]:
# evaluate model 1 first
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

              precision    recall  f1-score   support

      center       0.91      0.62      0.73      1164
        left       0.73      0.89      0.80      2403
       right       0.78      0.70      0.74      1642

    accuracy                           0.77      5209
   macro avg       0.81      0.74      0.76      5209
weighted avg       0.79      0.77      0.77      5209

[[ 718  344  102]
 [  38 2149  216]
 [  36  451 1155]]


In [31]:
# evaluate model 2 now
print(classification_report(y_test, pred2))
print(confusion_matrix(y_test, pred2))

              precision    recall  f1-score   support

      center       0.97      0.52      0.68      1164
        left       0.66      0.92      0.77      2403
       right       0.78      0.60      0.68      1642

    accuracy                           0.73      5209
   macro avg       0.81      0.68      0.71      5209
weighted avg       0.77      0.73      0.72      5209

[[ 608  475   81]
 [   2 2209  192]
 [  15  638  989]]


In [33]:
# save the models and vectorizer
import joblib

In [36]:
joblib.dump(model, 'large_data_log.joblib')
joblib.dump(model2, 'large_data_rf.joblib')

joblib.dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']

### Try using a neural network now

In [35]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


In [79]:
model3 = Sequential()

model3.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
model3.add(Dropout(0.2))
model3.add(Dense(3, activation='softmax'))  # use for multi-class classification (center, left, right)

model3.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [80]:
y_train_numeric = y_train.map({'left': 0, 'right': 1, 'center': 2})
y_train_encoded = pd.get_dummies(y_train_numeric).values # creates an array instead of DF

y_test_numeric = y_test.map({'left': 0, 'right': 1, 'center': 2})
y_test_encoded = pd.get_dummies(y_test_numeric).values # creates an array instead of DF

In [81]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model3.fit(X_train, y_train_encoded, epochs=150, callbacks=[early_stopping])

Epoch 1/150
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 113ms/step - accuracy: 0.5544 - loss: 0.9436
Epoch 2/150


  current = self.get_monitor_value(logs)


[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 159ms/step - accuracy: 0.8977 - loss: 0.3606
Epoch 3/150
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 192ms/step - accuracy: 0.9770 - loss: 0.1308
Epoch 4/150
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 149ms/step - accuracy: 0.9962 - loss: 0.0497
Epoch 5/150
[1m135/380[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m37s[0m 155ms/step - accuracy: 0.9985 - loss: 0.0227

KeyboardInterrupt: 

In [None]:
# save
model3.save('nn.keras')

In [83]:
model3.evaluate(X_test,y_test_encoded)

[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.7636 - loss: 0.6613


[0.6448351740837097, 0.7725090980529785]