<a href="https://colab.research.google.com/github/surajK610/AI/blob/master/NLP_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import nltk
# Fetch the VADER lexicon resource (the captured log below shows it being
# downloaded to /root/nltk_data); presumably the analyzer built on the next
# line loads it - confirm against the NLTK docs.
nltk.download('vader_lexicon')
vader = SentimentIntensityAnalyzer()

# Smoke test: as the last expression of the cell, the returned score dict
# ('neg' / 'neu' / 'pos' / 'compound') is what the notebook displays.
vader.polarity_scores('happy')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.5719}

In [None]:
import spacy
!python3 -m spacy download en_core_web_sm
# from google.colab import drive

import numpy as np
import pandas as pd
import pickle

num_documents = 5000 # INFO: Feel free to change this to load in less documents for debugging but otherwise keep it at 5000 to train the Topic Model
FILEPATH = "/content/drive/MyDrive/CSCI 1460 - Computational Linguistics/Homework/A2 - Topic Modeling/" # TODO: Update this to the filepath of your copy of the assignment, e.g. /content/drive/MyDrive/Topic Modeling/
nlp = spacy.load('en_core_web_sm')

doc_locations = []
spacy_processed_docs = []

if exists(f"{FILEPATH}spacy_processed_docs_{num_documents}.pkl"):
  with open(f"{FILEPATH}spacy_processed_docs_{num_documents}.pkl", 'rb') as f:
    spacy_processed_docs, doc_locations = pickle.load(f)
    f.close()
else:
  with open(f'{FILEPATH}articles_sampled_data.csv', 'r', encoding='utf-8') as f:
    for i, row in tqdm(enumerate(csv.DictReader(f, delimiter=','))):
      if i == num_documents:
        break
      if i % 500 == 0:
        print("Processing row %d"%i)
      try:
        parsed = nlp(row["content"])
        source_name = row["location"]
      except ValueError:
        continue
      spacy_processed_docs.append(parsed)
      doc_locations.append(source_name)
    f.close()

  with open(f"{FILEPATH}spacy_processed_docs_{num_documents}.pkl", 'wb') as f:
    pickle.dump((spacy_processed_docs, doc_locations), f)
    f.close()

def preprocess(raw_X: List[str]) -> List[List[str]]:
    """
    Tokenize every raw string by splitting it on whitespace.

    No other normalisation (casing, punctuation, stop words) is applied.

    Parameters
    ----------
    raw_X : List[str]
        The raw strings (tweets) to tokenize

    Returns
    -------
    List[List[str]]
        One list of whitespace-separated tokens per input string, in input order
    """
    tokenized = []
    for tweet in raw_X:
        # split() with no argument collapses runs of whitespace and drops
        # leading/trailing whitespace.
        tokenized.append(tweet.split())
    return tokenized

def preprocess_part2(
    parsed_tweets: "List[spacy.tokens.Doc]", vocab_size: int = 1000
) -> List[List[str]]:
    """
    Preprocesses the spacy-parsed tweets.

    Stop words, punctuation and whitespace tokens are dropped, numeric tokens
    are replaced by "<NUM>", and every other token becomes its lower-cased,
    stripped lemma. Afterwards only the `vocab_size` most frequent resulting
    tokens (counted over `parsed_tweets` itself) are kept; all others are
    replaced by "<OOV>".

    Parameters
    ----------
    parsed_tweets : List[spacy.tokens.Doc]
        A list of tweets parsed by spacy
    vocab_size : int, default=1000
        How many of the most frequent tokens to keep before mapping the rest
        to "<OOV>"

    Returns
    -------
    List[List[str]]
        A list of preprocessed tweets formatted as lists of tokens (lists of strings)
    """
    preproc = []
    for doc in parsed_tweets:
        tokens = []
        for token in doc:
            if token.is_stop or token.is_punct or token.is_space:
                continue
            if token.pos_ == "NUM" or token.like_num:
                tokens.append("<NUM>")
            else:
                tokens.append(token.lemma_.lower().strip())
        preproc.append(tokens)

    counts = {}
    for tokens in preproc:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

    # sorted() is stable, so tokens with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # A set makes the per-token membership test below O(1) instead of a scan
    # over the whole vocabulary list.
    vocab = {token for token, _ in ranked[:vocab_size]}

    return [
        [token if token in vocab else "<OOV>" for token in tokens]
        for tokens in preproc
    ]

from scipy.sparse import csr_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


class BOW_Classifier:
    """
    Bag-of-words tweet classifier: binary word-presence features fed into a
    logistic regression model.

    Attributes
    ----------
    clf : LogisticRegression
        The logistic regression classifier (max_iter=150)
    dv : DictVectorizer
        The vectorizer mapping word dictionaries to matrix columns; None until
        featurize() has been run with is_test=False
    """

    def __init__(self):
        self.dv = None  # built by featurize() on the training path
        self.clf = LogisticRegression(max_iter=150)

    def featurize(
        self, preproc_X: np.ndarray[List[str]], is_test: bool = False
    ) -> csr_matrix:
        """
        Build the binary bag-of-words representation of preprocessed tweets.

        Parameters
        ----------
        preproc_X : np.ndarray[List[str]]
            The preprocessed tweets, each a sequence of tokens
        is_test: bool, default=False
            If True, reuse the vectorizer fitted earlier (self.dv); if False,
            create a new DictVectorizer and fit it on preproc_X

        Returns
        -------
        csr_matrix
            Whatever the vectorizer's transform / fit_transform returns for the
            per-tweet {word: 1} dictionaries (one row per tweet)
        """
        # Each tweet becomes {word: 1}; repeated words collapse to one key,
        # which is what makes the features binary.
        token_dicts = [dict.fromkeys(tokens, 1) for tokens in preproc_X]

        if not is_test:
            self.dv = DictVectorizer()
            return self.dv.fit_transform(token_dicts)
        return self.dv.transform(token_dicts)

    def train(self, X_train: np.ndarray[List[str]], y_train: np.ndarray[int]):
        """
        Fit the vectorizer and the classifier on the training tweets.

        Parameters
        ----------
        X_train : np.ndarray[List[str]]
            Preprocessed training tweets
        y_train : np.ndarray[int]
            The label of each training tweet
        """
        features = self.featurize(X_train)
        self.clf.fit(features, y_train)

    def test(self, X_test: np.ndarray[List[str]]) -> np.ndarray[int]:
        """
        Predict labels for tweets using the features learned during training.

        Parameters
        ----------
        X_test : np.ndarray[List[str]]
            Preprocessed tweets to classify

        Returns
        -------
        np.ndarray[int]
            The classifier's prediction for each tweet
        """
        features = self.featurize(X_test, is_test=True)
        return self.clf.predict(features)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


def run_kfold_crossval(
    model: BOW_Classifier, X: List[List[str]], y: List[int], k: int = 5
) -> List[float]:
    """
    Run stratified k-fold cross-validation and collect per-fold accuracy.

    Parameters
    ----------
    model : BOW_Classifier
        The classifier; it is re-trained on the training part of every fold
    X : List[List[str]]
        Preprocessed tweets
    y : List[int]
        The label of each tweet
    k : int, default=5
        Number of folds

    Returns
    -------
    List[float]
        The accuracy on the held-out part of each fold, in fold order
    """
    # Convert to arrays so the fold index arrays can be used for indexing.
    samples = np.array(X, dtype=list)
    labels = np.array(y)
    splitter = StratifiedKFold(n_splits=k)

    fold_accuracies = []
    for train_idx, test_idx in splitter.split(samples, labels):
        model.train(samples[train_idx], labels[train_idx])
        predicted = model.test(samples[test_idx])
        fold_accuracies.append(accuracy_score(labels[test_idx], predicted))

    return fold_accuracies

2023-10-26 12:30:14.054447: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-26 12:30:14.054522: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-26 12:30:14.054575: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now l

NameError: ignored

In [None]:
from collections import Counter
def binary_term_doc_matrix(docs : "List[spacy.tokens.Doc]") -> "Tuple[np.ndarray[np.float64], Dict[int, str]]":
  """
  Build a binary document-by-term matrix over lower-cased lemmas.

  The annotations are written as strings so that defining the function does
  not depend on the typing names being imported at this point.

  Parameters
  ----------
  docs : List[spacy.tokens.Doc]
      The parsed documents; only iteration over tokens and `token.lemma_`
      are used

  Returns
  -------
  Tuple[np.ndarray[np.float64], Dict[int, str]]
      M : array of shape (number of docs, number of distinct lemmas) where
          M[i, j] is 1 if lemma j occurs in document i and 0 otherwise
      idx2word : mapping from column index to lemma; columns are ordered by
          descending corpus frequency
  """
  # Lemmatize every document once; the counting pass and the matrix fill
  # below both reuse the result.
  doc_lemmas = [[token.lemma_.lower() for token in doc] for doc in docs]

  lemma_counts = Counter()
  for lemmas in doc_lemmas:
    lemma_counts.update(lemmas)

  # most_common() without an argument lists every lemma by descending count,
  # so a lemma's column index is its frequency rank.
  idx2word = {rank: word for rank, (word, _) in enumerate(lemma_counts.most_common())}
  word2idx = {word: rank for rank, word in idx2word.items()}

  M = np.zeros((len(doc_lemmas), len(idx2word)))
  for row, lemmas in enumerate(doc_lemmas):
    for lemma in lemmas:
      M[row, word2idx[lemma]] = 1

  return M, idx2word

from sklearn.decomposition import LatentDirichletAllocation

def train_topic_model(term_doc_mat : np.ndarray[np.float64], n_topics : int = 10, random_state = 42) -> LatentDirichletAllocation:
  """
  Fit an LDA topic model on a term-document matrix.

  Parameters
  ----------
  term_doc_mat : np.ndarray[np.float64]
      The matrix the model is fitted on
  n_topics : int, default=10
      Number of topics (passed as n_components)
  random_state : default=42
      Seed forwarded to the model

  Returns
  -------
  LatentDirichletAllocation
      The fitted model
  """
  # fit() returns the estimator itself, so construction and fitting chain.
  return LatentDirichletAllocation(
    n_components=n_topics,
    random_state=random_state,
  ).fit(term_doc_mat)

from collections import Counter # HINT: you may find this useful
import math

def preprocess_doc(doc : spacy.tokens.Doc) -> List[str]:
  """
  Reduce a parsed document to its lower-cased content lemmas.

  Tokens that are stop words, punctuation, or tagged with the 'SPACE'
  part of speech are left out.

  Parameters
  ----------
  doc : spacy.tokens.Doc
      The parsed document

  Returns
  -------
  List[str]
      The lower-cased lemma of every kept token, in document order
  """
  return [
    token.lemma_.lower()
    for token in doc
    if not token.is_stop and not token.is_punct and token.pos_ != 'SPACE'
  ]

def create_vocab(proc_docs : List[List[str]], vocab_cutoff : int = 5000) -> List[str]:
  """
  Collect the most frequent words across all preprocessed documents.

  Parameters
  ----------
  proc_docs : List[List[str]]
      The preprocessed documents, each a list of words
  vocab_cutoff : int, default=5000
      Maximum number of words to keep

  Returns
  -------
  List[str]
      Up to vocab_cutoff words, ordered from most to least frequent
  """
  frequencies = Counter(word for proc_doc in proc_docs for word in proc_doc)
  return [word for word, _ in frequencies.most_common(vocab_cutoff)]

In [None]:
from transformers import DistilBertForSequenceClassification
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import torch, codecs, random
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import datasets
from datasets import load_metric
from google.colab import output
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Dict, Any
from transformers import PreTrainedTokenizer
from torch.optim import Optimizer

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# `.to(device)` calls in the training/evaluation loops still work on a
# CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class TweetDataset(torch.utils.data.Dataset):
  """
  Map-style PyTorch Dataset that tokenizes a tweet on access and pairs it
  with its sentiment label.
  """
  def __init__(self, tweets : List[str], sentiments : List[int], tokenizer : PreTrainedTokenizer) -> None:
    """
    Store the raw data and the tokenizer; nothing is encoded up front.

    Attributes
    ----------
    tweets : List[str]
      The tweets, one string each
    sentiments: List[int]
      The integer sentiment of each tweet ('negative': 0, 'neutral': 1, 'positive': 2)
    tokenizer : PreTrainedTokenizer
      The HuggingFace tokenizer used to encode a tweet; its
      model_max_length is used as the padded/truncated sequence length
    """
    self.tokenizer = tokenizer
    self.max_len = tokenizer.model_max_length
    self.tweets = tweets
    self.sentiments = sentiments

  def __len__(self) -> int:
    """
    Number of tweets held by the dataset.
    """
    return len(self.tweets)

  def __getitem__(self, index : int) -> Dict[str, Any]:
    """
    Encode the tweet at `index` and return it together with its label.

    Parameters:
    ----------
    index : int
        Position of the example to fetch.

    Returns:
    -------
    Dict[str, Any]
        A dictionary with the keys:
        - 'input_ids': first row of the tokenizer's 'input_ids' output.
        - 'attention_mask': first row of the tokenizer's 'attention_mask' output.
        - 'labels': the sentiment as a torch.long tensor.
    """
    text = str(self.tweets[index])
    label = self.sentiments[index]

    # Pad/truncate every tweet to self.max_len and request "pt" tensors.
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors="pt",
      padding="max_length",
      truncation=True,
    )

    # Index [0] takes the first row of each returned tensor.
    input_ids = encoding['input_ids'][0]
    attention_mask = encoding['attention_mask'][0]
    labels = torch.tensor(label, dtype=torch.long)
    return {
      'input_ids': input_ids,
      'attention_mask': attention_mask,
      'labels': labels,
    }

# use this for the tokenizer argument of the TweetDataset
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# define the following TweetDataset objects... be careful to split the data as previously specified
train_dataset = TweetDataset(tweets = trainset, sentiments = trainlabs, tokenizer = tokenizer)
validation_dataset = TweetDataset(tweets = valset, sentiments = vallabs, tokenizer = tokenizer)
test_dataset = TweetDataset(tweets = testset, sentiments = testlabs, tokenizer = tokenizer)

# now construct DataLoader objects from the TweetDataset objects
# remember that the TweetDataset class is a child class of torch.utils.data.Dataset
# Only the training loader is shuffled, so every epoch visits the examples in
# a new order instead of the fixed order of `trainset`; the validation and
# test loaders keep their order.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=64)
test_dataloader = DataLoader(test_dataset, batch_size=64)

def update_metrics(metrics: List[datasets.Metric], predictions: torch.Tensor, labels: torch.Tensor) -> None:
  """
  Feed one batch of predictions and labels to every metric in a list

  Parameters:
  -----------
  metrics : List[Metric]
      The metrics to update.
  predictions : torch.Tensor
      Predictions for the batch, shape (batch_size, ...)
  labels : torch.Tensor
      Reference labels for the batch, shape (batch_size, ...)

  Returns:
  --------
  None
  """
  # The same batch goes to each metric; the labels are passed as `references`.
  batch_kwargs = {"predictions": predictions, "references": labels}
  for tracker in metrics:
    tracker.add_batch(**batch_kwargs)

def evaluate(model: torch.nn.Module, test_dataloader: torch.utils.data.DataLoader,
             device: torch.device, metric_strs: List[str]) -> Dict[str, float]:
  """
  Run a model over a dataloader and compute the requested metrics

  Parameters:
  -----------
  model : torch.nn.Module
      The model to evaluate; it is switched to eval mode here.
  test_dataloader : torch.utils.data.DataLoader
      Yields the batches (dicts of tensors, including "labels") to evaluate on.
  device : torch.device
      Every tensor of a batch is moved to this device before the forward pass.
  metric_strs : List[str]
      Names passed to load_metric, one metric per name.

  Returns:
  --------
  Dict[str, float]
      The merged output of every metric's compute().
  """
  metrics = [load_metric(name) for name in metric_strs]
  model.eval()

  # One progress-bar tick per batch.
  progress_bar = tqdm(range(len(test_dataloader)))

  for raw_batch in test_dataloader:
    batch = {key: tensor.to(device) for key, tensor in raw_batch.items()}

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
      logits = model(**batch).logits

    # The predicted class is the index of the largest logit.
    update_metrics(metrics, torch.argmax(logits, dim=-1), batch["labels"])
    progress_bar.update(1)

  # Merge the per-metric result dicts; a later metric overrides an equal key.
  results = {}
  for metric in metrics:
    results.update(metric.compute())
  return results

def train(model: torch.nn.Module, optimizer: Optimizer, num_epochs: int,
          train_dataloader: DataLoader, validation_dataloader: DataLoader,
          lr_scheduler: Any, device: torch.device) -> None:
  """
  Train a model for a number of epochs, validating after each one

  Parameters:
  -----------
  model : torch.nn.Module
      The model to train; its forward output must expose `.logits` and `.loss`.
  optimizer : Optimizer
      Stepped once per batch.
  num_epochs : int
      Number of passes over train_dataloader.
  train_dataloader : DataLoader
      Yields the training batches (dicts of tensors, including "labels").
  validation_dataloader : DataLoader
      Passed to evaluate() at the end of every epoch.
  lr_scheduler : Any
      Stepped once per batch, right after the optimizer.
  device : torch.device
      Every tensor of a batch is moved to this device before the forward pass.

  Returns:
  --------
  None
      Training and validation accuracy are printed per epoch.
  """
  for epoch in range(num_epochs):
    # evaluate() leaves the model in eval mode, so training mode is restored
    # at the start of every epoch.
    model.train()

    # A new accuracy metric per epoch keeps the epochs' statistics separate.
    metrics = [load_metric("accuracy")]

    print(f"Epoch {epoch + 1} training:")
    progress_bar = tqdm(range(len(train_dataloader)))

    for raw_batch in train_dataloader:
      batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}

      outputs = model(**batch)

      # Track training accuracy: predicted class = index of the largest logit.
      update_metrics(metrics, torch.argmax(outputs.logits, dim=-1), batch["labels"])

      # Backpropagate, update the weights and the learning rate, then clear
      # the gradients for the next batch.
      outputs.loss.backward()
      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()

      progress_bar.update(1)

    print(f"Epoch {epoch+1} average training metrics: accuracy={metrics[0].compute()['accuracy']}")

    print("Running validation:")
    val_metrics = evaluate(model, validation_dataloader, device, ['accuracy'])
    print(f"Epoch {epoch+1} validation: accuracy={val_metrics['accuracy']}")

# Load the "distilbert-base-uncased" checkpoint as a sequence classifier with
# 3 output labels (the TweetDataset docstring lists negative / neutral /
# positive).
# NOTE(review): the visible code never moves this model to `device`, while
# train()/evaluate() move every batch there - confirm a `.to(device)` happens
# before training.
pretrained_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
