<a href="https://colab.research.google.com/github/yiboliu/AIPI540-NLP/blob/main/AIPI540_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from zipfile import ZipFile
import copy
import time
import csv
import gensim
import pickle
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.cluster.util import cosine_distance
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import networkx as nx
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def unzip_file(file_name):
    with ZipFile(file_name, 'r') as zf:
        zf.extractall()
        print('Done')

Below are the functions for selecting words from the given tweets

In [57]:
def preprocess(sentence):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer() # lemmatize each word
    # stemmer = PorterStemmer() # stem each word
    sentence = sentence.lower() # to lower cases
    tokenizer = RegexpTokenizer(r'\w+') # tokenize and remove punctuations
    words = tokenizer.tokenize(sentence)
    results = []
    for word in words:
        if word in stop_words:
            continue
        lemmatized = lemmatizer.lemmatize(word)
        # results.append(stemmer.stem(lemmatized))
        results.append(lemmatized)

    return results

def select_words_for_each_sent(preprocessed, window_size):
    G = nx.Graph()
    for word in preprocessed:
        G.add_node(word)

    for i in range(len(preprocessed)):
        for distance in range(1, window_size):
            if i + distance >= len(preprocessed):
                break
            left = preprocessed[i]
            right = preprocessed[i+distance]
            if G.has_edge(left, right):
                G[left][right]['weight'] += 1
            else:
                G.add_edge(preprocessed[i], preprocessed[i+distance], weight=1)
    scores = nx.pagerank(G)
    selected = [k for k in scores if scores[k] >= float(1.0 / len(scores))]
    return " ".join(selected)

def select_words(file_name):
    df = pd.DataFrame(columns=['textID', 'selected_text', 'sentiment'])
    with open(file_name, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        cnt = 0
        for row in reader:
            preprocessed = preprocess(row['text'])
            selected_text = select_words_for_each_sent(preprocessed, 2)
            if not selected_text:
                continue
            cnt += 1
            df = pd.concat([df, pd.DataFrame({'textID': [row['textID']], 'selected_text': [selected_text], 'sentiment': [row['sentiment']]})])
        df.index = pd.RangeIndex(start=0,stop=cnt, step=1)
    return df

This code block is for processing the selected keywords to find its sentiment. TextClassification in both DL and non DL will be used for that purpose. The DL approach is PyTorch embeddingbag and linear layer. The non DL is Logistic regression

In [None]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader,TensorDataset
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torch import nn
from torchtext import data
from torch.nn.utils.rnn import pad_sequence

In [None]:
def build_datasets(df):
    train_iter = [(label, text) for label, text in zip(df['sentiment'].to_list(), df['selected_text'].to_list())]

    train_dataset = to_map_style_dataset(train_iter)

    num_train = int(len(train_dataset) * 0.8)
    num_val = len(train_dataset) - num_train
    train_dataset, val_dataset = random_split(train_dataset, [num_train, num_val])
    return train_dataset, val_dataset

def yield_tokens(data_iter, tokenizer):
    for _, text in data_iter:
        yield tokenizer(text)

def build_vocab(data_iter):
    tokenizer = get_tokenizer('spacy')
    vocab = build_vocab_from_iterator(yield_tokens(data_iter, tokenizer), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    return vocab, tokenizer

def collate_fn(batch, tokenizer, vocab):
    mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: mapping[x]
    label_list, text_list, offsets = [], [], [0]
    for (label, text) in batch:
        # print(f'text is {text}')
        label_list.append(label_pipeline(label))
        # print(f'label is {label}')
        # print(len(label_list))
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        # print(f'processed_text is {processed_text}')
        # print(f'processed_text.size(0) is {processed_text.size(0)}')
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    # print(f'label_list length is {label_list.size(0)}')
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    # offsets = torch.tensor(offsets)
    # print(f"Processed text lengths: {[text.size(0) for text in text_list]}")
    text_list = torch.cat(text_list)
    # print(f'max offset: {offsets[-1]}, textlist length {text_list.size(0)}')
    # print(f"Offsets: {offsets}")

    return label_list.to(device), text_list.to(device), offsets.to(device)

def get_dataloader(dataset, batch_size, vocab, tokenizer):
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn = lambda x: collate_fn(x, tokenizer, vocab))

def build_dataloaders(df):
    train_dataset, val_dataset = build_datasets(df)
    vocab, tokenizer = build_vocab(train_dataset)
    batch_size = 128
    train_dataloader = get_dataloader(train_dataset, batch_size, vocab, tokenizer)
    val_dataloader = get_dataloader(val_dataset, batch_size, vocab, tokenizer)
    dataloaders = {'train': train_dataloader, 'val': val_dataloader}
    dataset_sizes = {'train': len(train_dataset), 'val': len(val_dataset)}
    return dataloaders, dataset_sizes, vocab


In [96]:
class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        # Embedding layer
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, mode="mean",sparse=True)
        # Fully connected final layer to convert embeddings to output predictions
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        # print(f'text is {text.shape}')
        # print(f'offsets is {offsets.shape}')
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [None]:
def train_model(model, dataloaders, dataset_sizes, criterion, optimizer, scheduler, num_epochs=25):
    model = model.to(device)
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for labels, text, offsets in dataloaders[phase]:
                text = text.to(device)
                labels = labels.to(device)
                offsets = offsets.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model.forward(text, offsets)
                    loss = criterion(outputs, labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * text.size(0)
                _, preds = torch.max(outputs, 1)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            if phase == 'train':
              scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

def launch_training_dl(dataloaders, dataset_sizes, vocab, model_path):
    num_classes = 3
    vocab_size = len(vocab)
    embed_dim = 64
    model = TextClassificationModel(vocab_size, embed_dim, num_classes).to(device)
    epochs = 100
    lr = 5.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 2.0, gamma=0.9)
    model = train_model(model, dataloaders, dataset_sizes, criterion, optimizer, scheduler, num_epochs=epochs)
    torch.save(model.state_dict(), model_path)


In [76]:
def build_features(data, ngram_range):
    print(data.shape)
    vec = TfidfVectorizer(ngram_range=ngram_range)
    transformed = vec.fit_transform(data)
    print(transformed.shape)
    return transformed, vec

def train_and_test(X, y):
    num_folds = 5
    model = LogisticRegression(solver='saga', max_iter=5000)
    for i in range(num_folds):
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        print(accuracy_score(y_val, preds))
    return model


In [None]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
unzip_file('tweet-sentiment-extraction.zip')
df = select_words('train.csv')


Done


In [None]:
def get_stats_df(df):
    vals = []
    for item in df['selected_text']:
      vals.append(len(item.split()))
    counts = np.array(vals)
    return np.mean(counts), counts.max()

In [97]:
dataloaders, dataset_sizes, vocab = build_dataloaders(df)
launch_training_dl(dataloaders, dataset_sizes, vocab, 'model-dl.pth')




Epoch 1/100
----------
train Loss: 4.9848 Acc: 0.4773
train Loss: 4.9848 Acc: 0.4773
val Loss: 4.7073 Acc: 0.5257
val Loss: 4.7073 Acc: 0.5257
Epoch 2/100
----------
train Loss: 4.5703 Acc: 0.5419
train Loss: 4.5703 Acc: 0.5419
val Loss: 4.5990 Acc: 0.5545
val Loss: 4.5990 Acc: 0.5545
Epoch 3/100
----------
train Loss: 4.2905 Acc: 0.5849
train Loss: 4.2905 Acc: 0.5849
val Loss: 4.5830 Acc: 0.5562
val Loss: 4.5830 Acc: 0.5562
Epoch 4/100
----------
train Loss: 4.1102 Acc: 0.6106
train Loss: 4.1102 Acc: 0.6106
val Loss: 4.6588 Acc: 0.5596
val Loss: 4.6588 Acc: 0.5596
Epoch 5/100
----------
train Loss: 3.9522 Acc: 0.6365
train Loss: 3.9522 Acc: 0.6365
val Loss: 4.7191 Acc: 0.5604
val Loss: 4.7191 Acc: 0.5604
Epoch 6/100
----------
train Loss: 3.8172 Acc: 0.6532
train Loss: 3.8172 Acc: 0.6532
val Loss: 5.5951 Acc: 0.4923
val Loss: 5.5951 Acc: 0.4923
Epoch 7/100
----------
train Loss: 3.7017 Acc: 0.6657
train Loss: 3.7017 Acc: 0.6657
val Loss: 4.9061 Acc: 0.5567
val Loss: 4.9061 Acc: 0.5567

In [75]:
def launch_training_nondl(df, model_path):
    avg, max = get_stats_df(df)
    print(max)

    X_train, vec = build_features(df['selected_text'], (1, max))
    y_train = df['sentiment']
    model = train_and_test(X_train, y_train)
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    return model, vec

model, vec = launch_training_nondl(df, 'model-lr.pkl')

20
(27408,)
(27408, 368121)
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033
<class 'scipy.sparse._csr.csr_matrix'>
(5482, 368121)
0.5726012404232033


In [81]:
def serve_model(model_path, vec, input):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    preprocessed = preprocess(input)
    words = select_words_for_each_sent(preprocessed, 2)
    print(words)
    print(df.head())
    trans = vec.transform([words])
    print(trans.shape)
    features = trans.toarray()

    print(model.predict(features))

serve_model('model-lr.pkl', vec, 'that was a good game')


good game
       textID        selected_text sentiment
0  cb774db0d1      responded going   neutral
1  549e992a42         sad miss san  negative
2  088c60f138         bos bullying  negative
3  9642c003ef                leave  negative
4  358bd9e861  put release already  negative
(1, 368121)
['positive']


In [102]:
def serve_model_dl(model_path, vocab, input):
    mapping = {2: 'positive', 1: 'neutral', 0: 'negative'}
    model = TextClassificationModel(len(vocab), 64, 3).to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    preprocessed = preprocess(input)
    words = select_words_for_each_sent(preprocessed, 2)
    print(words)
    nums = [vocab.get_stoi()[word] for word in words.split()]
    print(nums)
    input = torch.tensor(nums).unsqueeze(0).to(device)
    output = model(text=input, offsets=None)
    print(output)
    print(mapping[output.argmax().item()])

serve_model_dl('model-dl.pth', vocab, 'I am so happy')

happy
[37]
tensor([[-5.1556, -1.0620,  7.2979]], grad_fn=<AddmmBackward0>)
positive
