In [21]:
import os

# Let the process continue when more than one OpenMP runtime gets loaded.
# Set here, in the first cell, so it is in place before torch/numpy are imported.
# NOTE(review): presumably a workaround for the "OMP: Error #15" abort — confirm
# it is still required in the current environment.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

import pickle
import geopandas as gpd

In [None]:
# Load the tweet table, attach point geometries, and label every tweet with the
# US state (or DC) that contains it.

# NOTE(security): pickle.load can execute arbitrary code stored in the file.
# Only load "geodata.pickle" when it comes from a trusted source.
with open("geodata.pickle", "rb") as f:
    tweets : pd.DataFrame = pickle.load(f)

# The long/lat columns are interpreted as WGS84, whose EPSG code is 4326
# (the previously used "EPSG:4369" was a typo and is not WGS84). The points are
# then reprojected to NAD83 (EPSG:4269) to match the state boundaries.
tweets = gpd.GeoDataFrame(
    tweets, geometry=gpd.points_from_xy(tweets["long"], tweets["lat"], crs="EPSG:4326")
).to_crs("EPSG:4269")

# read_file forwards extra keyword arguments to the I/O engine instead of using
# them as a CRS override, so read the CRS from the shapefile itself and only
# assign NAD83 when the file carries no CRS information.
states = gpd.read_file("cb_2018_us_state_20m/cb_2018_us_state_20m.shp")
if states.crs is None:
    states = states.set_crs("EPSG:4269")
else:
    states = states.to_crs("EPSG:4269")

# State FIPS codes below 57 cover the 50 states plus the District of Columbia.
states = states[states["STATEFP"].astype(int) < 57]
states = states.drop(columns=["STATENS", "AFFGEOID", "GEOID", "LSAD", "ALAND", "AWATER", "NAME", "STATEFP"])
states = states.rename(columns={"STUSPS" : "state"})
# A 0..n-1 index is needed because the row index later doubles as the class id.
states = states.reset_index(drop=True)

# Keep only tweets that fall inside the selected states, then attach the
# two-letter state code of the polygon each tweet intersects.
tweets = gpd.clip(tweets, states, keep_geom_type=True)
tweets = gpd.sjoin(tweets, states, how="left")
tweets = tweets.drop(columns=["index_right"])

# A left join keeps tweets that matched no polygon (state is NaN). Those rows
# cannot be labelled and would raise KeyError during training, so drop them here.
tweets = tweets.dropna(subset=["state"])


In [None]:
# ---- Model / training hyperparameters -------------------------------------
# Input width is taken from the first tweet's embedding.
EMBED_DIM = len(tweets['embed'].iloc[0])
# One output class per row of `states`.
NUM_CLASSES = len(states)
HIDDEN_SIZE = (EMBED_DIM)//2 + NUM_CLASSES
BATCH_SIZE = 16
LR = 1e-5            # Learning rate
EPOCHS = 100            # Number of training epochs

# ---- Label vocabulary ------------------------------------------------------
# Each state code maps to the index label of its row in `states`;
# `id_to_state` is the inverse lookup.
state_to_id = dict(zip(states["state"], states.index))
id_to_state = dict(zip(state_to_id.values(), state_to_id.keys()))

class TweetEmbeddingDataset(Dataset):
    """Map-style dataset yielding ``(embedding, state_id)`` tensor pairs.

    All embeddings and labels are converted to tensors once, at construction,
    instead of materialising two ``DataFrame.iloc`` rows for every sample on
    every epoch.

    Args:
        df: Table with an ``'embed'`` column (one numeric sequence per row, all
            of the same length) and a ``'state'`` column holding label keys.
        label_map: Optional mapping from ``'state'`` values to integer class
            ids. Defaults to the notebook-level ``state_to_id``.

    Raises:
        KeyError: If a ``'state'`` value (including NaN) is not in the mapping.
            Raised at construction rather than in the middle of training.
        ValueError: If the embeddings cannot be stacked into one 2-D array
            (for example, rows of differing length).
    """

    def __init__(self, df, label_map=None):
        self.df = df  # kept so existing code that reads `.df` still works
        mapping = state_to_id if label_map is None else label_map

        # Plain dict lookups so an unknown state raises KeyError, as before.
        label_ids = [mapping[state] for state in df['state']]
        self._labels = torch.as_tensor(label_ids, dtype=torch.long)

        # One (num_rows, embed_dim) float32 tensor shared by all samples.
        stacked = np.asarray(df['embed'].tolist(), dtype=np.float32)
        self._embeddings = torch.as_tensor(stacked, dtype=torch.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the float32 embedding and int64 class id at position ``idx``."""
        return self._embeddings[idx], self._labels[idx]
    
class StatePredictorMLP(nn.Module):
    """Feed-forward classifier: embedding -> two hidden layers -> class logits.

    Layer widths come from the notebook-level ``EMBED_DIM``, ``HIDDEN_SIZE``
    and ``NUM_CLASSES`` constants, read when the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        drop_prob = 0.2  # dropout probability used after each hidden activation
        stack = [
            nn.Linear(EMBED_DIM, HIDDEN_SIZE),
            nn.ReLU(),
            nn.Dropout(p=drop_prob),
            nn.Linear(HIDDEN_SIZE, HIDDEN_SIZE),
            nn.ReLU(),
            nn.Dropout(p=drop_prob),
            # Final projection emits raw logits; no softmax is applied here.
            nn.Linear(HIDDEN_SIZE, NUM_CLASSES),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network's output (unnormalised class scores) for ``x``."""
        logits = self.net(x)
        return logits

# Bare expression: shows the embedding width as this cell's output.
EMBED_DIM

In [58]:
def train(model, dataloader, criterion, optimizer, device='cpu'):
    """Run one optimisation pass over ``dataloader`` and report its metrics.

    Args:
        model: Module mapping a batch of embeddings to per-class scores.
        dataloader: Iterable of ``(embeddings, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)`` over all samples seen. Each batch loss is
        weighted by its batch size, so ``avg_loss`` is a per-sample mean
        provided ``criterion`` returns a batch mean (assumed; verify for custom
        losses). Accuracy is measured on the outputs produced in train mode.
    """
    model.train()

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item() * inputs.size(0)
        # The predicted class is the index of the largest score in each row.
        num_correct += (logits.argmax(dim=1) == targets).sum().item()
        num_seen += targets.size(0)

    return loss_sum / num_seen, num_correct / num_seen

In [59]:
def evaluate(model, dataloader, criterion, device='cpu'):
    """Score ``model`` on ``dataloader`` without updating any parameters.

    Args:
        model: Module mapping a batch of embeddings to per-class scores.
        dataloader: Iterable of ``(embeddings, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)`` over all samples seen, with each batch loss
        weighted by its batch size.
    """
    model.eval()  # switches layers such as dropout to inference behaviour

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():  # no gradients are tracked while scoring
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs)
            batch_loss = criterion(logits, targets)

            loss_sum += batch_loss.item() * inputs.size(0)
            num_correct += (logits.argmax(dim=1) == targets).sum().item()
            num_seen += targets.size(0)

    return loss_sum / num_seen, num_correct / num_seen

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# End-to-end training run: split the data, train for EPOCHS epochs, and write
# the per-epoch metrics to ./results/<LR>.csv.

# Create the output directory *before* training so a missing folder cannot
# discard the metrics after the whole run has finished.
os.makedirs("./results", exist_ok=True)

# Seed torch so weight initialisation, dropout and batch shuffling are
# repeatable, matching the fixed random_state used for the split below.
torch.manual_seed(34)

# 70 / 30 train / validation split; validation is every row not sampled for training.
tweets_train = tweets.sample(frac=0.7, random_state = 34)
tweets_val = tweets.drop(tweets_train.index)

train_dataset = TweetEmbeddingDataset(tweets_train)
val_dataset = TweetEmbeddingDataset(tweets_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = StatePredictorMLP().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# Collect one record per epoch and build the DataFrame once at the end rather
# than growing it row by row; this also keeps `epoch` as an integer.
history = []
for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device=device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device=device)

    # `epoch` is stored 0-based, while the progress line below is 1-based.
    history.append({
        "epoch": epoch,
        "train_loss": train_loss,
        "train_acc": train_acc,
        "val_loss": val_loss,
        "val_acc": val_acc,
    })
    print(f"Epoch [{epoch+1}/{EPOCHS}] "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} "
          f"| Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

df = pd.DataFrame(history, columns=["epoch", "train_loss", "train_acc", "val_loss", "val_acc"])
df.to_csv(f"./results/{LR}.csv")

# Leave the model in inference mode; as the last expression this also displays
# the model as the cell output.
model.eval()