In [1]:
import json
import ijson
import pandas as pd
import numpy as np
import re
import string
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
def json_to_dataframe(file_path):
    """
    Stream a JSON array of news articles from disk into a DataFrame.

    The file is parsed incrementally with ``ijson`` so the whole document never
    has to be held in memory. Each element of the top-level array may be either
    a single article (a dict) or a group of articles (a list of dicts); any
    other element, and any non-dict entry inside a group, is skipped.

    Parameters
    ----------
    file_path : str
        Path to the JSON file.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``title``, ``source``, ``text``
        and ``bias``. Missing fields default to ``''``. The columns are present
        even when no article was found.
    """
    columns = ['title', 'source', 'text', 'bias']
    rows = []

    with open(file_path, 'rb') as file:
        # The 'item' prefix selects each element of the top-level JSON array
        for element in ijson.items(file, 'item'):
            # Normalise both supported shapes to "a list of candidate articles"
            if isinstance(element, dict):
                articles = [element]
            elif isinstance(element, list):
                articles = element
            else:
                continue  # scalars / nulls carry no article data

            for article in articles:
                if not isinstance(article, dict):
                    continue

                # 'text' may be stored as a list of fragments; concatenate them as-is
                text_content = article.get('text', '')
                if isinstance(text_content, list):
                    text_content = ''.join(text_content)

                rows.append({
                    'title': article.get('title', ''),
                    'source': article.get('source', ''),
                    'text': text_content,
                    'bias': article.get('bias', ''),
                })

    # Passing the columns explicitly keeps the schema stable for an empty result
    return pd.DataFrame(rows, columns=columns)

# Example usage: stream the raw JSON dump into a DataFrame and preview it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = 'C://Users//gauth//Desktop//courses//CS5604//project//dataset1//BIGNEWSALIGN_min.json'
data = json_to_dataframe(file_path)
print(data.head())  # Print the first few rows to verify

                                               title    source  \
0                 What is Really Fueling Voter Anger  dailykos   
1  Brown's victory in Mass. senate race hardly a ...       wpo   
2  E2 Round-up: What Brown means for climate bill...      hill   
3  Video: Brown camp files criminal complaint aga...       wat   
4          There Are No Safe Seats for Democrats Now       fox   

                                                text        bias  
0  voter’s real anger. They are all highlighting ...        Left  
1  While many are describing the election to fill...   Lean Left  
2  All eyes are on Massachusetts today, the site ...      Center  
3  Dan Winslow, Brown campaign counsel, spoke to ...  Lean Right  
4  Scott Brown's stunning upset victory in Massac...       Right  


In [3]:
# Save the combined DataFrame to a CSV file; index=False keeps the row index out of the file.
# NOTE(review): clean_data() reads this file through a single-slash spelling of the
# path — presumably the same file on Windows; confirm.
csv_file_path = 'C://Users//gauth//Desktop//courses//CS5604//project//dataset1//combined_data2.csv'  # Specify your file path
data.to_csv(csv_file_path, index=False)

In [8]:
def textClean(text):
    """
    Normalise raw text for tokenisation.

    Every character outside the allowed set is replaced by a space, the text
    is lower-cased and split on whitespace, English stop words are removed and
    the remaining tokens are re-joined with single spaces.

    The allowed set is ASCII letters and digits plus ``^ , ! . / '`` and the
    character range ``+`` through ``=``. Because ``+-=`` inside the character
    class is a range, ``: ; <`` are kept as well.
    NOTE(review): that range looks accidental, but it is preserved here so the
    produced tokens do not change.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (what pandas yields for an empty CSV
        cell) are treated as empty text; other non-string values are
        stringified.

    Returns
    -------
    str
        The cleaned, space-joined text.
    """
    # Missing values would otherwise make re.sub raise a TypeError
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", str(text))

    # Load the stop-word list once and reuse it: this function is called once
    # per document, and the list never changes between calls.
    stops = getattr(textClean, "_stops", None)
    if stops is None:
        stops = textClean._stops = frozenset(stopwords.words("english"))

    return " ".join(w for w in text.lower().split() if w not in stops)


def cleanup(text):
    """
    Run ``textClean`` on ``text`` and then delete every ASCII punctuation
    character (``string.punctuation``) from the result.
    """
    # Translation table mapping each punctuation character to "delete"
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return textClean(text).translate(strip_punctuation)


def constructLabeledSentences(data):
    """
    Wrap each document in a gensim ``TaggedDocument`` for Doc2Vec training.

    Parameters
    ----------
    data : pandas.Series (or any mapping exposing ``.items()``)
        Documents keyed by index. Values are ``str``, or ``bytes`` which are
        decoded as UTF-8.

    Returns
    -------
    list
        One ``TaggedDocument`` per entry, with the whitespace-split tokens as
        words and the single tag ``'Text_<index>'``.
    """
    sentences = []
    # .items() replaces Series.iteritems(), which pandas 2.0 removed
    for index, row in data.items():
        # Same contract as gensim's to_unicode: str passes through, bytes are decoded
        text = row if isinstance(row, str) else str(row, 'utf8')
        # TaggedDocument (already imported by this file) replaces the removed LabeledSentence
        sentences.append(TaggedDocument(text.split(), ['Text' + '_%s' % str(index)]))
    return sentences


def getEmbeddings(path,vector_dimension=300):
    """
    Generate Doc2Vec training and testing data.

    Reads the CSV at ``path``, drops rows whose ``text`` is missing, cleans the
    text with ``cleanup``, trains a Doc2Vec model on all documents and returns
    the learned document vectors split 80/20 in file order (no shuffling).

    Parameters
    ----------
    path : str
        CSV file with at least ``text`` and ``bias`` columns. ``id`` and
        ``source`` columns are dropped when present.
    vector_dimension : int, default 300
        Size of each document vector.

    Returns
    -------
    tuple of numpy.ndarray
        ``(text_train_arrays, text_test_arrays, train_labels, test_labels)``.
        The vector arrays have shape ``(n, vector_dimension)``. Numeric labels
        are returned as floats; any other labels (e.g. bias names) keep their
        original values.
    """
    data = pd.read_csv(path)

    # Drop rows without text, then discard columns that are not needed.
    # errors='ignore' tolerates files that lack an 'id' or 'source' column.
    data = data.dropna(subset=['text']).reset_index(drop=True)
    data = data.drop(columns=['id', 'source'], errors='ignore')

    data['text'] = data['text'].map(cleanup)

    x = constructLabeledSentences(data['text'])
    y = np.asarray(data['bias'].values)

    text_model = Doc2Vec(min_count=1, window=5, vector_size=vector_dimension, sample=1e-4, negative=5, workers=7, epochs=10,
                         seed=1)
    text_model.build_vocab(x)
    # `epochs` is the attribute matching the constructor argument; the old `iter` alias is gone in gensim 4
    text_model.train(x, total_examples=text_model.corpus_count, epochs=text_model.epochs)

    # gensim 4 exposes document vectors as `.dv`; fall back to `.docvecs` on older releases
    doc_vectors = text_model.dv if hasattr(text_model, 'dv') else text_model.docvecs

    total = len(x)
    train_size = int(0.8 * total)

    # Tags were assigned as 'Text_<row index>' and the index was reset to 0..total-1 above
    all_vectors = np.array(
        [doc_vectors['Text_' + str(i)] for i in range(total)], dtype=float
    ).reshape(total, vector_dimension)

    # Numeric labels stay floats; string labels cannot be stored in a float array
    labels = y.astype(float) if y.dtype.kind in 'biuf' else y

    text_train_arrays = all_vectors[:train_size]
    text_test_arrays = all_vectors[train_size:]
    train_labels = labels[:train_size]
    test_labels = labels[train_size:]

    return text_train_arrays, text_test_arrays, train_labels, test_labels


def clean_data(path='C:/Users/gauth/Desktop/courses/CS5604/project/dataset1/combined_data2.csv',
               train_fraction=0.8, random_state=None):
    """
    Clean the article text, shuffle, split and save the result as ``.npy`` files.

    Reads the CSV at ``path``, drops rows whose ``text`` is missing, applies
    ``cleanup`` to every ``text`` value, shuffles the rows and writes four
    object arrays to the current working directory: ``xtr_min.npy`` /
    ``xte_min.npy`` (cleaned text) and ``ytr_min.npy`` / ``yte_min.npy``
    (``bias`` labels).

    Parameters
    ----------
    path : str
        CSV file with ``text`` and ``bias`` columns. Defaults to the location
        used in this project.
    train_fraction : float, default 0.8
        Share of the shuffled rows written to the training files.
    random_state : int or None, default None
        Seed for the shuffle. ``None`` gives a different order on every run;
        pass an integer for a reproducible split.

    Returns
    -------
    None
    """
    data = pd.read_csv(path)

    # An empty text cell is read back as NaN, which the text cleaner cannot process
    data = data.dropna(subset=['text']).reset_index(drop=True)

    data['text'] = data['text'].map(cleanup)

    # Shuffle all rows so the split is not tied to the file order
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    x = data.loc[:,'text'].values
    y = data.loc[:,'bias'].values

    train_size = int(train_fraction * len(y))

    np.save('xtr_min.npy', x[:train_size])
    np.save('xte_min.npy', x[train_size:])
    np.save('ytr_min.npy', y[:train_size])
    np.save('yte_min.npy', y[train_size:])


In [9]:
# Clean the CSV text, shuffle, split 80/20 and write the four *_min.npy files
clean_data()

In [10]:
# Reload the four saved splits in one pass. The arrays hold Python objects
# (strings), so allow_pickle=True is required.
# NOTE(review): unpickling is only safe because these files were written locally.
split_files = ('./xtr_min.npy', './xte_min.npy', './ytr_min.npy', './yte_min.npy')
xtr, xte, y_train, y_test = (
    np.load(split_file, allow_pickle=True) for split_file in split_files
)

In [12]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from keras.preprocessing import sequence
from collections import Counter
import os
import matplotlib.pyplot as plt
import scikitplot.plotters as skplt
from keras.models import Sequential
from keras.layers import Embedding
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [14]:
top_words = 5000
epoch_num = 5
batch_size = 64

# This cell overwrites y_train / y_test. If it is re-run after short articles
# have been filtered out, the labels would silently stop lining up with the
# freshly rebuilt token lists, so fail loudly instead.
assert len(y_train) == len(xtr), "y_train is stale: reload the *_min.npy splits before re-running this cell"
assert len(y_test) == len(xte), "y_test is stale: reload the *_min.npy splits before re-running this cell"

# Tokenise the training articles and count word frequencies
x_train = [article.split() for article in xtr]
cnt = Counter()
for tokens in x_train:
    cnt.update(tokens)

# Storing most common words: ids start at 1 because 0 is the padding value
most_common = cnt.most_common(top_words + 1)
word_bank = {word: word_id for word_id, (word, _freq) in enumerate(most_common, start=1)}

# Encode the sentences: keep only known words, replaced by their ids.
# (A comprehension is linear; deleting from the middle of a list in a loop is quadratic.)
x_train = [[word_bank[word] for word in tokens if word in word_bank] for tokens in x_train]

y_train = list(y_train)
y_test = list(y_test)

# Delete the short news (10 tokens or fewer), keeping the labels aligned
keep_mask = [len(tokens) > 10 for tokens in x_train]
x_train = [tokens for tokens, keep in zip(x_train, keep_mask) if keep]
y_train = [label for label, keep in zip(y_train, keep_mask) if keep]

# Generating test data with the vocabulary learned from the training set;
# test articles are not length-filtered
x_test = [[word_bank[word] for word in article.split() if word in word_bank] for article in xte]

# Truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(x_test, maxlen=max_review_length)

# Convert to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)


In [16]:
import pandas as pd

# Wrap the feature matrices and label vectors in DataFrames for display
X_train_df, X_test_df = pd.DataFrame(X_train), pd.DataFrame(X_test)
y_train_df = pd.DataFrame(y_train, columns=['Bias'])
y_test_df = pd.DataFrame(y_test, columns=['Bias'])

# Show 5 random rows of features + label for each split
train_with_labels = pd.concat([X_train_df, y_train_df], axis=1)
print("5 Random Training Samples:")
print(train_with_labels.sample(5))
print()

test_with_labels = pd.concat([X_test_df, y_test_df], axis=1)
print("5 Random Test Samples:")
print(test_with_labels.sample(5))

5 Random Training Samples:
         0   1    2     3    4     5     6   7     8    9  ...   491  492  \
134105  48  29  391  3553  141  1386  1449  85  1957  114  ...    29  109   
89954    0   0    0     0    0     0     0   0     0    0  ...   337  178   
25851    0   0    0     0    0     0     0   0     0    0  ...  4009   21   
972      0   0    0     0    0     0     0   0     0    0  ...   176  926   
20102    0   0    0     0    0     0     0   0     0    0  ...     1  147   

         493   494   495   496   497   498   499       Bias  
134105   392   222  2145   654   399  2552   507       Left  
89954    511  1028   592   310  2179  4344     7  Lean Left  
25851    360   122  1847  2543   459   230  2240  Lean Left  
972      214   290   333  1852  1036  3988    29      Right  
20102   1101   351  2244    30   774  2679  2180  Lean Left  

[5 rows x 501 columns]

5 Random Test Samples:
       0    1  2   3   4     5     6   7    8    9  ...  491  492   493   494  \
27152  0 

In [18]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Learn the label -> integer mapping from the training labels only,
# then apply the same mapping to both splits
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

y_train = np.asarray(label_encoder.transform(y_train))
y_test = np.asarray(label_encoder.transform(y_test))

# Verify the transformation (numpy abbreviates long arrays when printing)
for caption, encoded in (("Encoded y_train:", y_train), ("Encoded y_test:", y_test)):
    print(caption, encoded)

Encoded y_train: [0 2 4 ... 4 1 4]
Encoded y_test: [4 1 1 ... 1 4 4]


In [20]:
# Report the shape of each encoded label vector
for caption, labels in (("Shape of y_train:", y_train), ("Shape of y_test:", y_test)):
    print(caption, labels.shape)

Shape of y_train: (137796,)
Shape of y_test: (34449,)


In [21]:
# Report the shape of each padded feature matrix (the messages previously said y_*)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of y_train: (137796, 500)
Shape of y_test: (34449, 500)


In [22]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import time

In [23]:
# Initialize the SVM classifier with verbose output and a maximum iteration limit.
# NOTE(review): X_train holds zero-padded word-id sequences, i.e. arbitrary
# vocabulary ranks rather than magnitudes, and max_iter=200 caps the solver's
# iterations. The recorded evaluation (about 0.20 accuracy, nearly every
# prediction in one class) suggests this setup does not learn; consider a
# bag-of-words / TF-IDF representation or a linear model — TODO confirm.
svm_classifier = SVC(kernel='rbf', class_weight='balanced', decision_function_shape='ovo', verbose=True, max_iter=200)

# Monitor training time with a monotonic clock: unlike time.time(), it cannot
# jump if the system clock is adjusted during this long-running fit
start_time = time.perf_counter()

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Measure end time
end_time = time.perf_counter()

# Calculate total duration
duration = end_time - start_time
print(f"Training completed in {duration:.2f} seconds")


[LibSVM]Training completed in 12303.41 seconds




In [24]:
# Predict on the test data
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of SVM model:", accuracy)

# Print the classification report.
# zero_division=0 reports 0.00 for classes that were never predicted without
# emitting an undefined-metric warning for each of them.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Print the confusion matrix (rows = true class, columns = predicted class)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy of SVM model: 0.20276350547185695
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      6873
           1       1.00      0.00      0.00      6850
           2       0.20      1.00      0.34      6970
           3       0.40      0.00      0.00      6919
           4       0.00      0.00      0.00      6837

    accuracy                           0.20     34449
   macro avg       0.32      0.20      0.07     34449
weighted avg       0.32      0.20      0.07     34449

Confusion Matrix:
[[   0    0 6873    0    0]
 [   0   11 6836    3    0]
 [   0    0 6970    0    0]
 [   0    0 6915    4    0]
 [   0    0 6834    3    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
from joblib import dump, load
# Save the fitted SVM classifier to a file in the current working directory using joblib
model_filename = 'svm_classifier.joblib'
dump(svm_classifier, model_filename)
print(f"Model saved to {model_filename}")


Model saved to svm_classifier.joblib
