In [1]:
import pandas as pd
from sklearn import metrics
from tqdm import tqdm
import numpy as np
import re

In [2]:
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import AdamW

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# load data
dtypes = {'cfips': str}
df = pd.read_csv('gpt_pro_con_no_null.csv', dtype=dtypes)
df.head()

Unnamed: 0,cfips,gpt_pro_1,gpt_pro_2,gpt_pro_3,gpt_con_1,gpt_con_2,gpt_con_3
0,1001,"Autauga County has a low cost of living, makin...","The county has a strong business community, pr...",Autauga County is located in the heart of Alab...,Autauga County has a relatively small populati...,The county has a limited number of resources a...,Autauga County is subject to the laws and regu...
1,1003,Baldwin County has a strong economy with a low...,"The cost of living is relatively low, making i...",There are numerous resources available to help...,The local government has strict regulations an...,"The area is prone to natural disasters, such a...",There is a limited pool of skilled labor avail...
2,1005,"Low cost of living in Barbour County, Alabama,...",Access to a large customer base due to the cou...,Access to a variety of resources and support f...,Limited access to capital and financing option...,Lack of access to a skilled workforce due to t...,Limited access to technology and infrastructur...
3,1007,"Low cost of living in Bibb County, Alabama, ma...",Access to a large customer base due to the cou...,Access to a variety of resources and support f...,Limited access to capital and financing option...,Limited access to skilled labor due to the cou...,Limited access to technology and infrastructur...
4,1009,"Low cost of living in Blount County, Alabama, ...",Access to a large customer base due to the cou...,Access to resources such as the Blount County ...,Limited access to venture capital and other fo...,Limited access to skilled labor due to the cou...,Limited access to technology and other resourc...


In [5]:
def read_csv_to_sentences(df):
    docs = []
    targets = []
    for i in range(len(df)):
        docs.append(df['gpt_pro_1'][i])
        docs.append(df['gpt_pro_2'][i])
        docs.append(df['gpt_pro_3'][i])
        docs.append(df['gpt_con_1'][i])
        docs.append(df['gpt_con_2'][i])
        docs.append(df['gpt_con_3'][i])
        targets.append(1)
        targets.append(1)
        targets.append(1)
        targets.append(0)
        targets.append(0)
        targets.append(0)
    return docs, targets

In [6]:
docs, targets = read_csv_to_sentences(df)
final_sentences = docs
final_targets = targets

In [7]:
# train test split
from sklearn.model_selection import train_test_split
train_sentences, test_sentences, train_labels, test_labels = train_test_split(final_sentences, final_targets, 
                                                                                test_size=0.2, random_state=42)

In [8]:
# tokenizer for distilbert
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# MAX_LEN
MAX_LEN = 256

# BATCH_SIZE
BATCH_SIZE = 16

In [9]:
# data loader
class GPTCommentDataset(Dataset):
    def __init__(self, sentences, targets, tokenizer, max_len):
        self.sentences = sentences
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'sentence': sentence,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

train_set = GPTCommentDataset(train_sentences, train_labels, tokenizer, MAX_LEN)
test_set = GPTCommentDataset(test_sentences, test_labels, tokenizer, MAX_LEN)

# data loader for train and test
train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

valid_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

train_dl = DataLoader(train_set, **train_params)
valid_dl = DataLoader(test_set, **valid_params)

In [17]:
class model(nn.Module):
    def __init__(self, checkpoint, freeze=False, device='cpu'):
        super().__init__()
        
        self.device = device
        self.model = DistilBertModel.from_pretrained(checkpoint)
        if freeze:
            i = 0
            for layer in self.model.parameters():
                i += 1
                layer.requires_grad=False 
        # logistic regression
        self.fc_1 = nn.Linear(768, 52)
        self.relu_1 = nn.ReLU()
        self.fc_2 = nn.Linear(52, 2)
        self.softmax = nn.Softmax(dim=1)

        
    def forward(self, x, attention_mask=None):
        model_out = self.model(x['input_ids'], x['attention_mask'], return_dict=True)
        embds = model_out.last_hidden_state
        mean_pool = embds.sum(axis=1) / x['attention_mask'].sum(axis=1).unsqueeze(axis=1)
        extra_1 = self.relu_1(self.fc_1(mean_pool))
        extra_2 = self.softmax(self.fc_2(extra_1))
        return extra_2

    def embedding(self, x, attention_mask=None):
        model_out = self.model(x['input_ids'], x['attention_mask'], return_dict=True)
        embds = model_out.last_hidden_state
        mean_pool = embds.sum(axis=1) / x['attention_mask'].sum(axis=1).unsqueeze(axis=1)
        extra_1 = self.relu_1(self.fc_1(mean_pool))
        return extra_1

In [18]:
# model
checkpoint = 'distilbert-base-uncased'
distilbert = model(checkpoint, freeze=False)
distilbert.to('cpu')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768

In [19]:
# fine tuning
def train(model, train_dl, valid_dl, optimizer, criterion, epochs, device):
    model = model.to(device)
    criterion = criterion.to(device)
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        train_acc = 0
        for batch in tqdm(train_dl):
            optimizer.zero_grad()
            output = model(batch)
            loss = criterion(output, batch['targets'])
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_acc += (output.argmax(1) == batch['targets']).sum().item()
        model.eval()
        losses = []
        accs = []
        with torch.no_grad():
            for batch in valid_dl:
                output = model(batch)
                loss = criterion(output, batch['targets'])
                losses.append(loss.item())
                accs.append((output.argmax(1) == batch['targets']).sum().item())
        val_loss = np.mean(losses)
        val_acc = np.mean(accs)
        print(f'Epoch {epoch}, Train loss: {train_loss/len(train_dl):.3f}, Train acc: {train_acc/len(train_dl):.3f}')
        print(f'Epoch {epoch}, Val loss: {val_loss:.3f}, Val acc: {val_acc:.3f}')

In [20]:
# fine tune distilbert
optimizer = AdamW(distilbert.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
train(distilbert, train_dl, valid_dl, optimizer, criterion, 2, 'cpu')

100%|██████████| 943/943 [39:42<00:00,  2.53s/it]


Epoch 0, Train loss: 0.320, Train acc: 15.918
Epoch 0, Val loss: 0.314, Val acc: 15.975


  5%|▍         | 44/943 [01:55<39:19,  2.62s/it]


KeyboardInterrupt: 

## Already really good, no need for more training

In [21]:
# save model
torch.save(distilbert.state_dict(), 'finetuned_distilbert.pt')

In [22]:
class embedding_model(nn.Module):
    def __init__(self, checkpoint, freeze=False, device='cpu'):
        super().__init__()
        
        self.device = device
        self.model = DistilBertModel.from_pretrained(checkpoint)
        if freeze:
            i = 0
            for layer in self.model.parameters():
                i += 1
                layer.requires_grad=False 
        # logistic regression
        self.fc_1 = nn.Linear(768, 52)
        self.relu_1 = nn.ReLU()
        self.fc_2 = nn.Linear(52, 2)
        self.softmax = nn.Softmax(dim=1)

        
    def forward(self, x, attention_mask=None):
        model_out = self.model(x['input_ids'], x['attention_mask'], return_dict=True)
        embds = model_out.last_hidden_state
        mean_pool = embds.sum(axis=1) / x['attention_mask'].sum(axis=1).unsqueeze(axis=1)
        extra_1 = self.relu_1(self.fc_1(mean_pool))
        extra_2 = self.softmax(self.fc_2(extra_1))
        return extra_2

    def embedding_0(self, x, attention_mask=None):
        model_out = self.model(x['input_ids'], x['attention_mask'], return_dict=True)
        embds = model_out.last_hidden_state
        mean_pool = embds.sum(axis=1) / x['attention_mask'].sum(axis=1).unsqueeze(axis=1)
        return mean_pool

    def embedding_1(self, x, attention_mask=None):
        model_out = self.model(x['input_ids'], x['attention_mask'], return_dict=True)
        embds = model_out.last_hidden_state
        mean_pool = embds.sum(axis=1) / x['attention_mask'].sum(axis=1).unsqueeze(axis=1)
        extra_1 = self.relu_1(self.fc_1(mean_pool))
        return extra_1

In [23]:
# model
embedding_model = embedding_model(checkpoint, freeze=False)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# load weights
embedding_model.load_state_dict(torch.load('finetuned_distilbert.pt'))

<All keys matched successfully>

In [25]:
# get embeddings
def get_embeddings_0(model, dl, device):
    model = model.to(device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for batch in dl:
            output = model.embedding_0(batch)
            embeddings.append(output)
    return embeddings

def get_embeddings_1(model, dl, device):
    model = model.to(device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for batch in dl:
            output = model.embedding_1(batch)
            embeddings.append(output)
    return embeddings

In [27]:
# get dataset from final_sentences = docs
docs, targets = read_csv_to_sentences(df)
final_sentences = docs
final_targets = targets

data_set = GPTCommentDataset(final_sentences, final_targets, tokenizer, MAX_LEN)

data_params = {'batch_size': BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

data_dl = DataLoader(data_set, **data_params)

# get embeddings
embeddings_0 = get_embeddings_0(embedding_model, data_dl, 'cpu')

In [49]:
# to numpy on all levels
embeddings_0_np = []
for i in range(len(embeddings_0)):
    for emb in embeddings_0[i]:
        embeddings_0_np.append(emb.numpy().tolist())

len(embeddings_0_np) / 6


3142.0

In [50]:
embeddings_0_list = embeddings_0_np

In [51]:
len(embeddings_0_list) / 6

3142.0

In [52]:
len(embeddings_0_list[0])

768

In [53]:
import json

with open('finetuned_distillBERT_embeddings.json', 'w') as f:
    json.dump(embeddings_0_list, f)

In [54]:
positive_embeddings = []
negative_embeddings = []
for i in range(len(embeddings_0_list)):
    if i % 6 == 0 or i % 6 == 1 or i % 6 == 2:
        positive_embeddings.append(embeddings_0_list[i])
    else:
        negative_embeddings.append(embeddings_0_list[i])

positive_sentences = []
negative_sentences = []
for i in range(len(final_sentences)):
    if i % 6 == 0 or i % 6 == 1 or i % 6 == 2:
        positive_sentences.append(final_sentences[i])
    else:
        negative_sentences.append(final_sentences[i])

In [55]:
positive_sentences[:6]

['Autauga County has a low cost of living, making it an affordable place to start a business.',
 'The county has a strong business community, providing resources and support for entrepreneurs.',
 'Autauga County is located in the heart of Alabama, providing easy access to major cities and transportation hubs.',
 'Baldwin County has a strong economy with a low unemployment rate.',
 'The cost of living is relatively low, making it an attractive place to start a business.',
 'There are numerous resources available to help small businesses get started, such as the Small Business Development Center and the Baldwin County Economic Development Alliance.']

In [56]:
from sklearn.cluster import KMeans

# set seed for the KMeans clustering
np.random.seed(222)

positive_clusterer = KMeans(n_clusters=10, verbose=0)
positive_clustered_docs = positive_clusterer.fit_predict(positive_embeddings)

negative_clusterer = KMeans(n_clusters=10, verbose=0)
negative_clustered_docs = negative_clusterer.fit_predict(negative_embeddings)

# positive cluster matric
positive_cluster_labels = positive_clusterer.labels_
positive_cluster_centers = positive_clusterer.cluster_centers_

# negative cluster matric
negative_cluster_labels = negative_clusterer.labels_
negative_cluster_centers = negative_clusterer.cluster_centers_

len(positive_cluster_labels)/3 , len(negative_cluster_labels)/3

(3142.0, 3142.0)

In [57]:
positive_matched_list = list(zip(positive_sentences, positive_cluster_labels))
negative_matched_list = list(zip(negative_sentences, negative_cluster_labels))

In [82]:
def get_cluster_sentences(first, last, some_list, posi=True):
    for i in range(first, last):
        if posi:
            print(f'Positive Cluster {i}:')
        else:
            print(f'Negative Cluster {i}:')
        j = 0
        for sentence, cluster in some_list:
            if cluster == i:
                print(sentence)
                j += 1
            if j == 12:
                break

In [None]:
positive_cluster_names = {
    0: 'Supportive_Community',
    1: 'Low_Cost_of_Living',
    2: 'Government_Support',
    3: 'Large_Customer_Base',
    4: 'Low_Cost_of_Living',
    5: 'Low_Cost_of_Living',
    6: 'Natural_Resources',
    7: 'Connected_Economy',
    8: 'Large_Customer_Base',
    9: 'Government_Support'
}

negative_cluster_names = {
    0: 'Limited_Tech_n_Infra',
    1: 'Limited_Financing',
    2: 'Tough_Labor_Market',
    3: 'Various_Downsides',
    4: 'Limited_Tech_n_Infra',
    5: 'Tough_Labor_Market',
    6: 'Low_Pop_Bad_Weather',
    7: 'Limited_Financing',
    8: 'Low_Pop_Bad_Weather',
    9: 'Unfavorable_Location'
}

In [83]:
get_cluster_sentences(0, 2, positive_matched_list, posi=True)

Positive Cluster 0:
The county has a strong business community, providing resources and support for entrepreneurs.
Autauga County is located in the heart of Alabama, providing easy access to major cities and transportation hubs.
Baldwin County has a strong economy with a low unemployment rate.
There are numerous resources available to help small businesses get started, such as the Small Business Development Center and the Baldwin County Economic Development Alliance.
Access to a large customer base due to the county's population of over 27,000 people.
Access to a large customer base due to the county's proximity to Birmingham and other major cities.
Access to a large customer base due to the county's population of over 57,000 people.
Access to resources such as the Blount County Chamber of Commerce, which provides support and resources to local businesses.
Access to resources such as the Small Business Development Center and the Butler County Chamber of Commerce
Access to a large custo