In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and the saved model
# files used throughout this notebook are read from / written to paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers



In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.3 MB[0m [31m18.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Load the Pegasus model and tokenizer.
# `pegasus_tokenizer` and `model_transformers` are module-level globals that the
# summarization helper further down reads directly.
# NOTE(review): the load reports the encoder/decoder `embed_positions` weights as
# newly initialized; presumably these are fixed positional embeddings that are not
# stored in the checkpoint -- confirm before relying on the generated summaries.
model_name = 'google/pegasus-large'
pegasus_tokenizer = PegasusTokenizer.from_pretrained(model_name)
model_transformers = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd

In [None]:
# Load the labelled articles. Columns read below: 'political party', 'header', 'content'.
df = pd.read_csv('/content/drive/MyDrive/articles/political_data_1.csv')  # Replace with the actual path of CSV file
# NOTE(review): this list of [party, header, content] rows is rebound to the plain
# 'content' strings in a later cell before anything consumes it.
summaries = df[['political party', 'header', 'content']].values.tolist()
# Raw label strings, one per row, taken before any missing-value handling.
labels = df['political party'].tolist()  # actual column name

In [None]:
# Combine party, header and body into one text column. Missing cells are treated
# as empty strings so that a single NaN field no longer turns the whole combined
# value into NaN (string + NaN propagates NaN in pandas).
df['summary'] = (
    df['political party'].fillna('')
    + " " + df['header'].fillna('')
    + " " + df['content'].fillna('')
)

In [None]:
# Turn every article body into a fixed-length sequence of integer word ids.
df['content'] = df['content'].fillna('')  # Replace NaN values with empty strings
summaries = df['content'].tolist()  # from here on `summaries` holds the article bodies

max_seq_length = 100  # Define the maximum sequence length
tokenizer = Tokenizer()
# NOTE(review): the vocabulary is fitted on every row, i.e. before the
# train/test split performed in a later cell, so test-set words are included.
tokenizer.fit_on_texts(summaries)
sequences = tokenizer.texts_to_sequences(summaries)
# Every row of X has exactly max_seq_length ids (padded or truncated as needed).
X = pad_sequences(sequences, maxlen=max_seq_length)

In [None]:
label_to_index = {"Left": 0, "Right": 1, "Center": 2}  # Define label-to-index mapping

# Encode straight from the dataframe column rather than from the module-level
# `labels` list, so this cell gives the same result no matter which other cells
# have already run (any later rebinding of `labels` cannot affect it).
encoded_labels = df['political party'].map(label_to_index)

# Fail early, with the offending values listed, when a row carries a label that is
# missing or not covered by the mapping (previously an opaque KeyError).
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unknown_values = df.loc[unmapped_rows, 'political party'].unique().tolist()
    raise ValueError(f"Unmapped 'political party' values: {unknown_values!r}")

y = encoded_labels.astype(int).tolist()

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs. The split is not stratified by label.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Convert both splits to int64 tensors: the features are word ids and the
# targets are class indices.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.long) for features in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.long) for targets in (y_train, y_test)
)


In [None]:
# Mini-batch loaders over the tensor splits.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Evaluation does not benefit from shuffling; keeping the original order makes the
# printed per-batch predictions reproducible between runs. Accuracy is unaffected.
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class PoliticalAffiliationCNN(nn.Module):
    """Convolutional text classifier over sequences of word ids.

    Each input sequence is embedded, scanned by one convolution per entry in
    ``filter_sizes`` (every kernel spans the full embedding width), max-pooled
    over the sequence axis, concatenated, passed through dropout and projected
    to ``num_classes`` scores.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word embedding.
        num_classes: Number of output classes.
        filter_sizes: Iterable of kernel heights (number of consecutive tokens).
        num_filters: Number of feature maps per kernel height.
        dropout_prob: Dropout probability applied before the final layer.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, filter_sizes, num_filters, dropout_prob):
        super().__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # One convolution per kernel height; the kernel width equals the
        # embedding size, so each filter collapses the embedding axis to 1.
        self.conv_layers = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
        ])

        # Fully connected layer over the concatenated pooled features
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

        # Dropout layer
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x, return_logits=False):
        """Score a batch of id sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len). ``seq_len`` must be at
                least the largest kernel height for the convolutions to apply.
            return_logits: When True, return the raw class scores instead of
                probabilities. Raw scores are what ``nn.CrossEntropyLoss``
                expects, since that loss applies log-softmax itself. Defaults
                to False, which keeps the original probability output.

        Returns:
            Tensor of shape (batch, num_classes): softmax probabilities by
            default, or unnormalised logits when ``return_logits`` is True.
        """
        # (batch, seq_len) -> (batch, seq_len, emb) -> (batch, 1, seq_len, emb)
        embedded = self.embedding(x).unsqueeze(1)

        # Each conv yields (batch, num_filters, seq_len - fs + 1, 1); drop the
        # trailing singleton axis after the ReLU.
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.conv_layers]

        # Max pooling over time: keep the strongest activation per filter.
        pooled = [F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2) for feature_map in conved]

        # Concatenate the pooled features, then regularise.
        features = self.dropout(torch.cat(pooled, dim=1))

        # Fully connected layer
        logits = self.fc(features)
        if return_logits:
            return logits

        return F.softmax(logits, dim=1)

# Example usage
vocab_size = len(tokenizer.word_index) + 1  # vocabulary size
embedding_dim = 100  # embedding dimension
num_classes = 3  # Number of political affiliations
filter_sizes = [3, 4, 5]  # Specify filter sizes for the convolutional layers
num_filters = 100  # Number of filters for each filter size
dropout_prob = 0.5  # Dropout probability

# Create an instance of the PoliticalAffiliationCNN model
model = PoliticalAffiliationCNN(vocab_size, embedding_dim, num_classes, filter_sizes, num_filters, dropout_prob)

# Define your loss function and optimizer.
# The model's forward pass already returns softmax probabilities, so feeding it to
# nn.CrossEntropyLoss (which applies log-softmax internally) would normalise twice
# and flatten the gradients. Taking the log of the probabilities and using NLLLoss
# gives the intended cross-entropy on the model's actual predictions.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
log_eps = 1e-12  # keeps log() finite when a probability underflows to 0

# Train your model (loop through your dataloader and optimize)
num_epochs = 200  # You can adjust the number of epochs here
model.train()  # make sure dropout is active even if the model was put in eval mode earlier
for epoch in range(num_epochs):
    # Batch names are deliberately distinct from the module-level `labels` list
    # so that training does not overwrite it.
    for batch_inputs, batch_labels in train_loader:
        optimizer.zero_grad()
        probabilities = model(batch_inputs)
        loss = criterion(torch.log(probabilities + log_eps), batch_labels)
        loss.backward()
        optimizer.step()

# Save the trained model: once as a full pickled module, once as a state dict.
model_filepath = '/content/drive/MyDrive/articles/trained_model/trained_model_5'
state_filepath = '/content/drive/MyDrive/articles/trained_model/trained_state_dict_5'
torch.save(model, model_filepath)
torch.save(model.state_dict(), state_filepath)

# Report per-batch predictions and the overall accuracy on the held-out set.
model.eval()  # Set the model to evaluation mode
all_predicted_classes = []
all_true_classes = []

with torch.no_grad():
    for test_inputs, test_labels in test_loader:
        test_outputs = model(test_inputs)
        # Index of the highest-scoring class for each row.
        predicted_classes = torch.max(test_outputs, 1).indices
        print("Predicted Probabilities:", test_outputs)
        print("Predicted Classes:", predicted_classes)
        print("Actual Classes:", test_labels)

        all_predicted_classes.extend(predicted_classes.numpy())
        all_true_classes.extend(test_labels.numpy())

# Accuracy = matching predictions / number of test examples.
true_array = np.array(all_true_classes)
num_correct = (np.array(all_predicted_classes) == true_array).sum().item()
accuracy = num_correct / true_array.shape[0]
print("Test Accuracy:", accuracy)

Predicted Probabilities: tensor([[4.5547e-01, 9.9721e-02, 4.4481e-01],
        [1.6435e-01, 2.2046e-01, 6.1520e-01],
        [1.0000e+00, 1.8380e-07, 2.9298e-07],
        [1.0000e+00, 4.6503e-09, 6.3895e-10],
        [8.9363e-13, 6.6842e-10, 1.0000e+00],
        [1.8237e-01, 2.9660e-01, 5.2103e-01],
        [8.1641e-02, 8.5565e-01, 6.2705e-02],
        [3.3295e-04, 9.9052e-01, 9.1462e-03],
        [6.8736e-13, 6.0781e-10, 1.0000e+00],
        [1.0000e+00, 5.5888e-09, 1.8523e-08],
        [6.5917e-03, 9.8689e-01, 6.5168e-03],
        [1.5739e-02, 9.6035e-01, 2.3915e-02],
        [1.5841e-09, 1.0000e+00, 2.9444e-10],
        [3.0338e-01, 6.1832e-01, 7.8306e-02],
        [3.9561e-06, 9.9999e-01, 5.0739e-06],
        [9.7977e-01, 1.8510e-02, 1.7157e-03],
        [9.8432e-02, 4.8654e-01, 4.1503e-01],
        [8.0360e-01, 9.4359e-02, 1.0204e-01],
        [9.6874e-01, 1.4554e-02, 1.6708e-02],
        [6.4051e-08, 1.0000e+00, 2.8437e-07],
        [1.0160e-07, 1.0000e+00, 1.1107e-07],
        [

In [None]:
# Define the Keras tokenizer that classify_article uses at inference time.
# It is fitted on `summaries`, the same texts the training `tokenizer` was fitted on.
# NOTE(review): the word ids only match the loaded classifier if that checkpoint
# was trained with a tokenizer fitted on this same corpus -- confirm.
keras_tokenizer = Tokenizer()
keras_tokenizer.fit_on_texts(summaries)

In [None]:
max_length = 1024   # Define the maximum length for both articles and summaries

# Checkpoint used for inference and the padded input length fed to it.
CLASSIFIER_MODEL_PATH = '/content/drive/MyDrive/articles/trained_model/trained_model_3'
CLASSIFIER_STATE_PATH = '/content/drive/MyDrive/articles/trained_model/trained_state_dict_3'
CLASSIFIER_SEQ_LEN = 100
_classifier_cache = {}  # re-running this cell clears it and forces a fresh load


def _load_classifier():
    """Return the trained classifier in eval mode, reading it from disk only once."""
    if 'model' not in _classifier_cache:
        # NOTE: torch.load unpickles arbitrary Python objects; only point these
        # paths at checkpoint files you trust.
        loaded_model = torch.load(CLASSIFIER_MODEL_PATH)
        loaded_model.load_state_dict(torch.load(CLASSIFIER_STATE_PATH))
        loaded_model.eval()
        _classifier_cache['model'] = loaded_model
    return _classifier_cache['model']


# Define a function to classify a new article
def classify_article(article):
    """Classify one article with the saved CNN.

    Args:
        article: Article text. ``None`` or a float NaN (a missing CSV cell) is
            treated as an empty string, mirroring how missing training bodies
            are replaced with '' before tokenization.

    Returns:
        A ``(outputs, predicted_index)`` tuple: ``outputs`` is the model's
        output tensor for the single-article batch and ``predicted_index`` is
        the integer position of its largest entry.
    """
    if article is None or (isinstance(article, float) and article != article):
        article = ''

    # Previously both checkpoint files were re-read on every call; the cached
    # loader makes classifying a list of articles read them only once.
    loaded_model = _load_classifier()

    sequence = keras_tokenizer.texts_to_sequences([article])  # Use Keras tokenizer
    X_new = pad_sequences(sequence, maxlen=CLASSIFIER_SEQ_LEN)
    X_new_tensor = torch.tensor(X_new, dtype=torch.long)
    with torch.no_grad():
        outputs = loaded_model(X_new_tensor)
        _, predicted = torch.max(outputs, 1)
    return outputs, predicted.item()

# Define a function to summarize an article
def summarize_article(article, model_transformers):
    """Produce a summary string for ``article`` using the given generation model.

    The text is encoded with the module-level ``pegasus_tokenizer`` (truncated to
    ``max_length`` tokens), summarised with 5-beam search capped at ``max_length``
    output tokens, and the first generated sequence is decoded without special
    tokens.
    """
    token_ids = pegasus_tokenizer.encode(
        article,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    generated = model_transformers.generate(
        token_ids,
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
    )
    return pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)

# Load your external articles
df_new_articles = pd.read_csv('/content/drive/MyDrive/articles/new_articles/articles-2.csv')
# Missing bodies become empty strings (the same treatment the training data gets),
# so every element of `articles` is a str: a NaN here would break tokenization as
# well as the later " ".join(articles).
articles = df_new_articles['content'].fillna('').tolist()

# Classify each article
classifications = [classify_article(article) for article in articles]
print(classifications)


[(tensor([[1.0000e+00, 2.8829e-07, 5.6316e-07]]), 0), (tensor([[0.2200, 0.2685, 0.5115]]), 2), (tensor([[0.3777, 0.4262, 0.1961]]), 1)]


In [None]:
# Summarise all external articles as one document.
# NOTE(review): summarize_article encodes with truncation=True and
# max_length=max_length, so any text past that many tokens in the concatenation is
# not seen by the model -- later articles may not influence the summary.
articles_combined = " ".join(articles)
summary = summarize_article(articles_combined, model_transformers)

In [None]:
# Sanity check: the generated summary is expected to be a plain str.
print(type(summary))

<class 'str'>


In [None]:


# Specify the file path
file_path = "output.txt"

# Write the summary with an explicit encoding: the articles contain non-ASCII
# punctuation (curly quotes), and the platform default encoding is not guaranteed
# to be able to represent it.
with open(file_path, "w", encoding="utf-8") as file:
    # Write the string to the file
    file.write(summary)

print(f"String saved to {file_path}")


String saved to output.txt


In [None]:
# Run the generated summary through the same classifier used for the articles;
# prints the (output tensor, predicted class index) pair.
print(classify_article(summary))

(tensor([[0.5033, 0.1737, 0.3229]]), 0)


In [None]:
# Evaluate the checkpoint stored on Drive against the held-out test set.
# NOTE(review): this reads the *_3 files while the training cell writes *_5 --
# confirm that this is the checkpoint you intend to evaluate.
loaded_model = torch.load('/content/drive/MyDrive/articles/trained_model/trained_model_3')
loaded_model.load_state_dict(torch.load('/content/drive/MyDrive/articles/trained_model/trained_state_dict_3'))
# Switch the model that is actually evaluated below to evaluation mode. The
# original called model.eval() on the in-memory model instead, which left
# dropout potentially active on `loaded_model` and made its predictions noisy.
loaded_model.eval()
with torch.no_grad():
    all_predicted_classes = []
    all_true_classes = []
    # Assuming test_loader is your DataLoader for the test set
    for test_data in test_loader:
        test_inputs, test_labels = test_data
        test_outputs = loaded_model(test_inputs)
        _, predicted_classes = torch.max(test_outputs, 1)
        print("Predicted Probabilities:", test_outputs)
        print("Predicted Classes:", predicted_classes)
        print("Actual Classes:", test_labels)

        all_predicted_classes.extend(predicted_classes.numpy())
        all_true_classes.extend(test_labels.numpy())

# Accuracy = matching predictions / number of test examples.
accuracy = (np.array(all_predicted_classes) == np.array(all_true_classes)).sum().item() / np.array(all_true_classes).shape[0]
print("Test Accuracy:", accuracy)

Predicted Probabilities: tensor([[9.6321e-01, 9.2107e-03, 2.7575e-02],
        [2.8793e-08, 1.3262e-07, 1.0000e+00],
        [1.3096e-01, 3.9653e-02, 8.2939e-01],
        [1.7244e-02, 4.6503e-02, 9.3625e-01],
        [1.0000e+00, 1.6429e-07, 5.5911e-07],
        [2.7415e-01, 6.4226e-01, 8.3588e-02],
        [3.6455e-01, 1.7751e-01, 4.5794e-01],
        [4.8929e-03, 2.1796e-01, 7.7715e-01],
        [5.9306e-01, 4.0539e-01, 1.5597e-03],
        [4.5348e-02, 7.6116e-03, 9.4704e-01],
        [9.9976e-01, 1.7915e-04, 6.1172e-05],
        [2.7256e-01, 6.5970e-01, 6.7741e-02],
        [3.6787e-01, 5.9417e-01, 3.7957e-02],
        [8.6087e-03, 1.5683e-02, 9.7571e-01],
        [2.7688e-03, 9.9712e-01, 1.1400e-04],
        [5.8780e-04, 1.1920e-04, 9.9929e-01],
        [9.9996e-01, 3.5735e-05, 1.9871e-06],
        [1.9802e-04, 9.9980e-01, 5.6791e-06],
        [1.2442e-04, 2.1072e-02, 9.7880e-01],
        [6.8558e-01, 2.6585e-01, 4.8568e-02],
        [1.2189e-01, 8.7632e-01, 1.7833e-03],
        [

In [None]:
import nltk
# Fetch the NLTK resources requested by name below; the later cells call
# word_tokenize, pos_tag and ne_chunk.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
from collections import Counter
from nltk import word_tokenize, pos_tag, ne_chunk

def extract_entities(text):
    """Return the named entities found in ``text``.

    The text is tokenized, POS-tagged and chunked with NLTK. In the chunker's
    output, named entities are labelled subtrees while ordinary words remain
    plain ``(token, tag)`` tuples. The original filter kept the tuples, so it
    returned every non-entity token (punctuation, articles, ...) and dropped
    the entities themselves.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in order of
        appearance, with multi-word entities joined by single spaces. The
        tuples are hashable, so the list can be fed directly to ``Counter``.
    """
    # Use NLTK for named entity recognition (NER)
    tree = ne_chunk(pos_tag(word_tokenize(text)))
    entities = []
    for chunk in tree:
        # Only labelled subtrees are entities; plain tuples have no label().
        if hasattr(chunk, 'label'):
            entity_text = " ".join(token for token, _tag in chunk.leaves())
            entities.append((entity_text, chunk.label()))
    return entities

# Compare what extract_entities returns for the source articles vs. the summary.
input_articles = articles
generated_summary = summary

# Gather the extracted items from every input article, then from the summary.
input_entities = []
for article in input_articles:
    input_entities.extend(extract_entities(article))
summary_entities = extract_entities(generated_summary)

# Count how often each extracted item occurs.
input_entity_distribution = Counter(input_entities)
summary_entity_distribution = Counter(summary_entities)

# Show both frequency tables.
print("Entity Distribution in Input Articles:", input_entity_distribution)
print("Entity Distribution in Generated Summary:", summary_entity_distribution)


Entity Distribution in Input Articles: Counter({(',', ','): 90, ('the', 'DT'): 80, ('.', '.'): 72, ('and', 'CC'): 53, ('a', 'DT'): 46, ('of', 'IN'): 42, ('to', 'TO'): 39, ('’', 'NNP'): 33, ('in', 'IN'): 30, ('on', 'IN'): 28, ('his', 'PRP$'): 23, ('for', 'IN'): 22, ('is', 'VBZ'): 22, ('order', 'NN'): 17, ('The', 'DT'): 17, ('s', 'NN'): 15, ('case', 'NN'): 14, ('has', 'VBZ'): 14, ('”', 'NNP'): 14, ('are', 'VBP'): 14, ('as', 'IN'): 13, ('judge', 'NN'): 12, ('post', 'NN'): 12, ('with', 'IN'): 12, ('fraud', 'NN'): 11, ('about', 'IN'): 11, ('said', 'VBD'): 11, ('trial', 'NN'): 11, ('that', 'IN'): 11, ('by', 'IN'): 11, ('we', 'PRP'): 11, ('business', 'NN'): 10, ('everyone', 'NN'): 10, ('not', 'RB'): 10, ('Tuesday', 'NNP'): 9, ('court', 'NN'): 9, ('“', 'JJ'): 9, ('gag', 'NN'): 8, ('after', 'IN'): 8, ('my', 'PRP$'): 8, ('staff', 'NN'): 8, ('s', 'VBD'): 8, ('this', 'DT'): 8, ('against', 'IN'): 8, ('$', '$'): 8, ('former', 'JJ'): 8, ('“', 'VB'): 7, ('(', '('): 7, (')', ')'): 7, ('but', 'CC'): 7, 

In [None]:
def calculate_coverage(generated_summary, input_articles):
    """Fraction of the input vocabulary that also appears in the summary.

    Both texts are tokenized with ``word_tokenize`` and reduced to sets of
    distinct tokens; the result is ``|summary ∩ input| / |input|``. This is a
    token-level measure, not restricted to named entities.

    Args:
        generated_summary: The summary text.
        input_articles: Iterable of source article strings.

    Returns:
        A float in [0, 1]. Returns 0.0 when the input articles contain no
        tokens (previously this raised ZeroDivisionError).
    """
    # Tokenize input articles and summary
    input_tokens = set(word_tokenize(" ".join(input_articles)))
    if not input_tokens:
        return 0.0
    summary_tokens = set(word_tokenize(generated_summary))

    # Calculate coverage
    return len(summary_tokens & input_tokens) / len(input_tokens)

# Share of the articles' distinct tokens that also occur in the summary.
# NOTE(review): calculate_coverage works on all tokens, so the "Entity and Keyword"
# wording in the printed label is broader than what is actually measured.
entity_coverage = calculate_coverage(summary, articles)
print("Entity and Keyword Coverage:", entity_coverage)

NameError: ignored

In [None]:
from textblob import TextBlob

def analyze_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity

# Polarity of each source article, in order, and of the generated summary.
input_sentiment_scores = list(map(analyze_sentiment, articles))
summary_sentiment_score = analyze_sentiment(summary)

# Show the scores side by side for comparison.
print("Sentiment Scores for Input Articles:", input_sentiment_scores)
print("Sentiment Score for Generated Summary:", summary_sentiment_score)


Sentiment Scores for Input Articles: [0.03284215784215785, 0.0780619477987899, -0.0011415172129457826]
Sentiment Score for Generated Summary: 0.016450216450216448


In [None]:
!pip install nltk textblob bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
from bert_score import score
# Score the summary (candidate) against the concatenated articles (reference).
lst = [summary]
lang = "en"

# score() returns three tensors; only the third one is kept here.
bert_score_input = score(lst, [" ".join(articles)], lang=lang)[2]

# Report the mean of the kept tensor.
print("BERTScore for Generated Summary vs. Input Articles:", bert_score_input.mean().item())


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore for Generated Summary vs. Input Articles: 0.9130830764770508
