In [2]:
import pandas as pd
import spacy

In [14]:
df = pd.read_csv('political_articles2.csv')

In [15]:
df.head()
# although the 'text' column is short, we will try running basic ML

Unnamed: 0,text,label
0,Pfizer says its Covid-19 vaccine is safe and 1...,Left Data
1,The NBC-Wall Street Journal's latest national ...,Left Data
2,"September 19th, 2012 06:58 AM ET...",Left Data
3,Story highlightsCNN/ORC International poll: Si...,Left Data
4,"Washington (CNN)Dr. Anthony Fauci, the nation'...",Left Data


In [16]:
df['label'] = df['label'].apply(lambda x: 'left' if x == 'Left Data' else ('right' if x == 'Right Data' else 'center'))
df

Unnamed: 0,text,label
0,Pfizer says its Covid-19 vaccine is safe and 1...,left
1,The NBC-Wall Street Journal's latest national ...,left
2,"September 19th, 2012 06:58 AM ET...",left
3,Story highlightsCNN/ORC International poll: Si...,left
4,"Washington (CNN)Dr. Anthony Fauci, the nation'...",left
...,...,...
17357,"By Daniel De SimoneBBC NewsImage source, COUNT...",center
17358,...,center
17359,...,center
17360,President BidenJoe BidenUS could spend M month...,center


In [19]:
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 1500000

In [20]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ''  # or return None or some other placeholder
    doc = nlp(text)
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(processed_tokens)  # Join back into a string (if desired)

df['text_proc'] = df['text'].apply(preprocess_text)


In [21]:
df['text_proc']

0        Pfizer say Covid-19 vaccine safe 100 percent e...
1        NBC Wall Street Journal late national poll rel...
2                     September 19th 2012    06:58 et  ...
3        story highlightsCNN ORC International poll 10 ...
4        Washington CNN)Dr Anthony Fauci nation infecti...
                               ...                        
17357    Daniel De SimoneBBC NewsImage source COUNTER t...
17358                                                  ...
17359                                                  ...
17360    President BidenJoe bidenus spend M monthly tes...
17361    telecommunication worker Chris Viens Guy Glove...
Name: text_proc, Length: 17362, dtype: object

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text_proc'])

# X_dense is the feature vectors

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.3)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [35]:
pred = model.predict(X_test)

In [36]:
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

              precision    recall  f1-score   support

      center       0.89      0.63      0.74      1236
        left       0.70      0.90      0.79      2279
       right       0.80      0.67      0.73      1694

    accuracy                           0.76      5209
   macro avg       0.80      0.73      0.75      5209
weighted avg       0.78      0.76      0.76      5209

[[ 773  371   92]
 [  40 2049  190]
 [  54  511 1129]]


In [37]:
import joblib

In [29]:
joblib.dump(model, 'large_data_log.joblib')

['large_data_log.joblib']

In [42]:
df.to_csv('preprocessed_data.csv', index=False)

In [40]:
data = pd.read_csv('preprocessed_data.csv')

Unnamed: 0,text,label,text_proc
0,Pfizer says its Covid-19 vaccine is safe and 1...,left,Pfizer say Covid-19 vaccine safe 100 percent e...
1,The NBC-Wall Street Journal's latest national ...,left,NBC Wall Street Journal late national poll rel...
2,"September 19th, 2012 06:58 AM ET...",left,September 19th 2012 06:58 et ...
3,Story highlightsCNN/ORC International poll: Si...,left,story highlightsCNN ORC International poll 10 ...
4,"Washington (CNN)Dr. Anthony Fauci, the nation'...",left,Washington CNN)Dr Anthony Fauci nation infecti...
...,...,...,...
17357,"By Daniel De SimoneBBC NewsImage source, COUNT...",center,Daniel De SimoneBBC NewsImage source COUNTER t...
17358,...,center,...
17359,...,center,...
17360,President BidenJoe BidenUS could spend M month...,center,President BidenJoe bidenus spend M monthly tes...


In [41]:
joblib.dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']