In [1]:
import json
import ijson
import pandas as pd
import numpy as np
import re
import string
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [13]:
def json_to_dataframe(file_path):
    """
    Stream a JSON array of article records into a pandas DataFrame.

    The file is parsed incrementally with ijson, so the whole document is
    never held in memory as one Python object. Each element of the top-level
    array may be either a list of record dicts (nested layout) or a record
    dict itself (flat layout). Anything that is not a dict record is skipped.

    Parameters
    ----------
    file_path : str
        Path to a JSON file whose top-level value is an array.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns 'title', 'source', 'text' and
        'bias'. Missing fields become empty strings. A 'text' field given as
        a list of strings is concatenated without a separator. The four
        columns are present even when no record was found.
    """
    columns = ['title', 'source', 'text', 'bias']
    rows = []

    with open(file_path, 'rb') as file:
        # The 'item' prefix makes ijson yield the elements of the top-level array one by one
        for entry in ijson.items(file, 'item'):
            if isinstance(entry, dict):
                # Flat layout: the array element is already a single record
                records = (entry,)
            elif isinstance(entry, (list, tuple)):
                # Nested layout: the array element is a group of records
                records = entry
            else:
                # Scalars (strings, numbers, null) carry no record
                continue

            for record in records:
                if not isinstance(record, dict):
                    continue

                text_content = record.get('text', '')
                if isinstance(text_content, list):
                    # 'text' may be stored as a list of fragments
                    text_content = ''.join(text_content)

                rows.append({
                    'title': record.get('title', ''),
                    'source': record.get('source', ''),
                    'text': text_content,
                    'bias': record.get('bias', '')
                })

    # Passing the column list keeps the schema stable when `rows` is empty
    return pd.DataFrame(rows, columns=columns)

# Example usage:
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on the original author's machine; consider a configurable data
# directory instead.
file_path = 'C://Users//gauth//Desktop//courses//CS5604//project//dataset1//BIGNEWSALIGN_minimized_balanced_labeled_text_combined.json'
# Parse the whole JSON file into one DataFrame (columns: title, source, text, bias)
data = json_to_dataframe(file_path)
print(data.head())  # Print the first few rows to verify

                                               title    source  \
0  IL-SCT: White Doesn't Have To Sign Burris Cert...  dailykos   
1             Defiant Burris Says Senate Seat Is His       nyt   
2  Burris Wins at Ill. High Court; Senate Path St...      hill   
3                   Support grows for Burris on Hill       wat   
4  Illinois High Court: Burris Appointment Valid ...       fox   

                                                text        bias  
0  Remember what I said yesterday about the Senat...        Left  
1  CHICAGO Even as Senate leaders continued to ch...   Lean Left  
2  Aspiring Sen. Roland Burris (D) won his certif...      Center  
3  Democratic leaders backpedaled furiously Wedne...  Lean Right  
4  The appointment of Roland Burris to the U.S. S...       Right  


In [14]:
# Save the combined DataFrame to a CSV file
# The same path is read back later by getEmbeddings(); index=False keeps the
# DataFrame's row index out of the file.
csv_file_path = 'C://Users//gauth//Desktop//courses//CS5604//project//dataset1//combined_data1.csv'  # Specify your file path
data.to_csv(csv_file_path, index=False)


In [18]:
import numpy as np
import pandas as pd
import re
import string
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim import utils
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def textClean(text):
    """
    Normalize raw text into a list of lowercase tokens without stop words.

    Every character other than ASCII letters, digits and the punctuation
    ^ , ! . / ' + - = is replaced by a space. The text is then lowercased,
    split on whitespace, and English stop words are dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    # The hyphen is escaped on purpose: an unescaped "+-=" inside a character
    # class is a range that would also let ':', ';' and '<' through.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    words = text.lower().split()
    # The stop-word list is cached so it is not reloaded for every document
    stops = _english_stopwords()
    return [w for w in words if w not in stops]

def cleanup(text):
    """
    Tokenize the text with textClean and return at most its first 1000
    tokens joined by single spaces.
    """
    tokens = textClean(text)
    kept = tokens[:1000]  # anything past the 1000th token is discarded
    return " ".join(kept)

def constructTaggedDocuments(data):
    """
    Wrap each text in a TaggedDocument so it can be fed to Doc2Vec.

    The words are the whitespace-split tokens of the text, and the single
    tag is 'Text_<position>', where position is the text's 0-based place in
    `data`.
    """
    return [
        TaggedDocument(utils.to_unicode(doc).split(), ['Text_%s' % str(position)])
        for position, doc in enumerate(data)
    ]

def getEmbeddings(path, vector_dimension=300, test_size=0.2, random_state=42):
    """
    Train Doc2Vec on the CSV's 'text' column and return a train/test split.

    Parameters
    ----------
    path : str
        CSV file with at least the columns 'text' and 'bias'.
    vector_dimension : int, default 300
        Size of each document vector.
    test_size : float, default 0.2
        Fraction of documents held out for the test split.
    random_state : int, default 42
        Seed passed to train_test_split.

    Returns
    -------
    tuple of numpy.ndarray
        (x_train, x_test, y_train, y_test), where the x arrays hold the
        document vectors and the y arrays hold the matching 'bias' values.
    """
    data = pd.read_csv(path)

    # Empty 'text' cells are read back from CSV as NaN (a float), which would
    # make the regex substitution in the cleaning step raise TypeError.
    # Treat them as empty documents so rows and labels stay aligned.
    cleaned_text = data['text'].fillna('').apply(cleanup)

    x = constructTaggedDocuments(cleaned_text)
    y = data['bias'].values

    model = Doc2Vec(min_count=1, window=5, vector_size=vector_dimension, sample=1e-4, negative=5, workers=7, epochs=10, seed=1)
    model.build_vocab(x)
    model.train(x, total_examples=model.corpus_count, epochs=model.epochs)

    # Integer keys address document vectors by position, i.e. in the order the documents were tagged
    doc_vectors = [model.dv[i] for i in range(len(x))]
    x_train, x_test, y_train, y_test = train_test_split(doc_vectors, y, test_size=test_size, random_state=random_state)

    return np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)

# Build the document embeddings from the CSV written earlier in the notebook
path = 'C://Users//gauth//Desktop//courses//CS5604//project//dataset1//combined_data1.csv'
splits = getEmbeddings(path)
x_train, x_test, y_train, y_test = splits

# Persist each split next to the notebook, one .npy file per array
output_names = ('x_train.npy', 'x_test.npy', 'y_train.npy', 'y_test.npy')
for output_name, split in zip(output_names, splits):
    np.save(output_name, split)


MemoryError: 

In [3]:
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from keras.preprocessing import sequence
from collections import Counter
import os
import matplotlib.pyplot as plt
import scikitplot.plotters as skplt
from keras.models import Sequential
from keras.layers import Embedding
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [4]:
# Vocabulary cut-off: the preprocessing below keeps the `top_words + 1` most frequent words
top_words = 5000
# Number of training epochs passed to model.fit
epoch_num = 5
# Mini-batch size passed to model.fit
batch_size = 64

def plot_cmat(yte, ypred):
    '''Plot a confusion matrix and show the figure.

    Both arguments are passed unchanged to scikit-plot's
    plot_confusion_matrix; `yte` is presumably the true test labels and
    `ypred` the predicted labels (verify against the caller).
    '''
    skplt.plot_confusion_matrix(yte, ypred)
    plt.show()

def encode_news(tokens, vocab):
    """Replace each token by its id in `vocab`, dropping tokens that are not in it."""
    return [vocab[token] for token in tokens if token in vocab]


# Read the text data
# NOTE(review): clean_data() is not defined in any cell shown in this notebook,
# so this branch raises NameError on a fresh kernel when a file is missing.
# The embedding cell above writes x_train.npy etc., not the *_shuffled.npy files.
shuffled_files = (
    './xtr_shuffled.npy',
    './xte_shuffled.npy',
    './ytr_shuffled.npy',
    './yte_shuffled.npy',
)
if not all(os.path.isfile(shuffled_file) for shuffled_file in shuffled_files):
    clean_data()

# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only load files that were produced locally.
xtr = np.load('./xtr_shuffled.npy', allow_pickle=True)
xte = np.load('./xte_shuffled.npy', allow_pickle=True)
y_train = np.load('./ytr_shuffled.npy', allow_pickle=True)
y_test = np.load('./yte_shuffled.npy', allow_pickle=True)

# Tokenize the training texts and count word frequencies over the training set only
x_train = [text.split() for text in xtr]
cnt = Counter()
for tokens in x_train:
    cnt.update(tokens)

# Storing most common words
# Ids start at 1 (most frequent word), so id 0 is never assigned to a word
most_common = cnt.most_common(top_words + 1)
word_bank = {word: id_num for id_num, (word, freq) in enumerate(most_common, start=1)}

# Encode the training sentences (words outside the word bank are dropped)
x_train = [encode_news(tokens, word_bank) for tokens in x_train]

y_train = list(y_train)
y_test = list(y_test)

# Delete the short news (10 encoded words or fewer) together with their labels
if len(x_train) != len(y_train):
    raise ValueError(
        'Training texts and labels differ in length: %d vs %d' % (len(x_train), len(y_train))
    )
kept_pairs = [(seq, label) for seq, label in zip(x_train, y_train) if len(seq) > 10]
x_train = [seq for seq, _ in kept_pairs]
y_train = [label for _, label in kept_pairs]

# Generating test data: same tokenization and encoding, but no length filter
x_test = [encode_news(text.split(), word_bank) for text in xte]

# Truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(x_test, maxlen=max_review_length)

# Convert to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing import sequence

# Hyperparameters for the network (top_words and max_review_length repeat the values used during preprocessing)
top_words = 5000
embedding_vector_length = 32
max_review_length = 500

# Embedding -> LSTM -> single sigmoid unit, declared in one go.
# input_dim is top_words + 2 because word ids run from 1 to top_words + 1 and 0 is left unused by the word bank.
model = Sequential([
    Embedding(input_dim=top_words + 2, output_dim=embedding_vector_length),
    LSTM(units=100),
    Dense(units=1, activation='sigmoid'),
])

# Binary classification setup, then train while validating on the test split
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epoch_num,
    batch_size=batch_size,
)

Epoch 1/5
[1m176/251[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m18s[0m 248ms/step - accuracy: 0.7379 - loss: 0.5180