In [5]:
import numpy as np
## Matrix Handling

import pandas as pd
## Dataframe Handling

from sklearn.model_selection import train_test_split
## Splitting the dataset

from sklearn.linear_model import LogisticRegression
## Classification model trained on the embedding vectors produced by DistilBERT

from sklearn.model_selection import GridSearchCV
## Some Tuning

from sklearn.model_selection import cross_val_score
## Scoring

## We will be implementing the DistilBert using PyTorch
import torch
import transformers as ppb
## Hugging Face Transformers (used here with the PyTorch backend)

import warnings
warnings.filterwarnings('ignore')
## Ignore Warnings

import time

In [21]:
def distilbert(df_name, max_rows=500, batch_size=None):
    """Benchmark DistilBERT sentence embeddings + logistic regression on one CSV.

    The CSV is read with pandas, rows containing any missing value are dropped,
    and at most ``max_rows`` rows are kept. The ``sw_exclude`` column is
    tokenized and passed through the pretrained ``distilbert-base-uncased``
    model; the hidden state at the first token position of each sequence is
    used as the feature vector. A logistic-regression classifier is then tuned
    over ``C`` with ``GridSearchCV`` on a 70/30 train/test split and evaluated
    on the held-out part.

    Parameters
    ----------
    df_name : str or path-like
        Path of the CSV file. It must contain the columns ``sw_exclude``
        (text to embed) and ``sentiment`` (class label).
    max_rows : int or None, default 500
        Maximum number of rows (after ``dropna``) to use. ``None`` uses all.
    batch_size : int or None, default None
        Number of rows per forward pass through DistilBERT. ``None`` runs all
        rows in a single pass; a smaller value bounds peak memory.

    Returns
    -------
    None
        The best grid-search parameters, the test accuracy and the elapsed
        time (model loading through classifier fitting) are printed.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive, or no rows remain after ``dropna``.
    """
    if batch_size is not None and batch_size <= 0:
        raise ValueError("batch_size must be a positive integer or None")

    df = pd.read_csv(df_name).dropna()
    if max_rows is not None:
        df = df[:max_rows]
    if df.empty:
        raise ValueError("no rows left in %r after dropping missing values" % (df_name,))

    # Labels are used as stored in the file: scikit-learn classifiers accept
    # arbitrary class values (negative ones included), so no remapping is needed.
    labels = df["sentiment"]

    # Load pretrained model/tokenizer
    time_start = time.perf_counter()
    model_class, tokenizer_class, pretrained_weights = (
        ppb.DistilBertModel,
        ppb.DistilBertTokenizer,
        'distilbert-base-uncased',
    )
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    model.eval()  # make inference mode explicit (disables dropout)

    tokenized = df["sw_exclude"].apply(
        lambda text: tokenizer.encode(text, truncation=True, add_special_tokens=True)
    )

    # Right-pad every sequence to the longest one. The attention mask is built
    # from the true sequence lengths rather than from "token id != 0", so it
    # stays correct regardless of which ids occur inside the text.
    sequences = tokenized.values
    n_rows = len(sequences)
    max_len = max(len(ids) for ids in sequences)
    padded = np.full((n_rows, max_len), tokenizer.pad_token_id, dtype=np.int64)
    mask = np.zeros((n_rows, max_len), dtype=np.int64)
    for row, ids in enumerate(sequences):
        padded[row, :len(ids)] = ids
        mask[row, :len(ids)] = 1

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(mask)

    # Run the encoder without gradient tracking, optionally in chunks so that
    # peak memory does not grow with the number of rows.
    step = n_rows if batch_size is None else batch_size
    first_token_states = []
    with torch.no_grad():
        for start in range(0, n_rows, step):
            stop = start + step
            outputs = model(input_ids[start:stop], attention_mask=attention_mask[start:stop])
            # outputs[0] is the last hidden state; position 0 is the first token.
            first_token_states.append(outputs[0][:, 0, :])
    features = torch.cat(first_token_states).numpy()

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, random_state=100, test_size=0.3
    )
    parameters = {'C': np.linspace(0.001, 100, 20)}

    grid_search = GridSearchCV(LogisticRegression(), parameters)
    grid_search.fit(train_features, train_labels)
    print('best parameters: ', grid_search.best_params_)
    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the full training split, so no second fit is needed.
    lr_clf = grid_search.best_estimator_
    time_end = time.perf_counter()
    print("Acc: ",lr_clf.score(test_features, test_labels))
    time_sum = time_end - time_start
    print("Time: ",time_sum)


In [22]:
# Run the DistilBERT + logistic-regression benchmark on email_preprocess.csv.
distilbert("email_preprocess.csv")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


best parameters:  {'C': 0.001}
Acc:  0.5
Time:  73.95782000000054


In [23]:
# Run the DistilBERT + logistic-regression benchmark on covid_preprocess.csv.
distilbert("covid_preprocess.csv")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


best parameters:  {'C': 5.264105263157894}
Acc:  0.7133333333333334
Time:  113.19737150000037


In [17]:
# Run the DistilBERT + logistic-regression benchmark on news_preprocess.csv.
# NOTE(review): this cell's execution count (17) is lower than that of the cell
# defining distilbert (21), so the recorded output may predate the current
# definition -- confirm with Restart Kernel -> Run All.
distilbert("news_preprocess.csv")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


best parameters:  {'C': 0.001}
Acc:  0.8066666666666666
Time:  281.5382407999996


In [18]:
# Run the DistilBERT + logistic-regression benchmark on twitter_preprocess.csv.
# NOTE(review): this cell's execution count (18) is lower than that of the cell
# defining distilbert (21), so the recorded output may predate the current
# definition -- confirm with Restart Kernel -> Run All.
distilbert("twitter_preprocess.csv")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


best parameters:  {'C': 5.264105263157894}
Acc:  0.6733333333333333
Time:  114.68295240000043


In [19]:
# Run the DistilBERT + logistic-regression benchmark on review_preprocess.csv.
# NOTE(review): this cell's execution count (19) is lower than that of the cell
# defining distilbert (21), so the recorded output may predate the current
# definition -- confirm with Restart Kernel -> Run All.
distilbert("review_preprocess.csv")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


best parameters:  {'C': 0.001}
Acc:  0.7833333333333333
Time:  256.27579700000024
