In [1]:
# import all the libraries
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
import polars as pl

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import polars as pl

import xgboost as xgb
from tqdm import tqdm

In [2]:
# read the data
data = pd.read_csv("../data/pullreq_with_code.csv")

In [18]:
# drop the rows with no code
data = data[data["added_code"].astype(str) != "None"]

In [8]:
# only get the merged data, and drop any data that have n/a as gender
rejected_data = data.loc[data['merged_or_not'] == 0]
rejected_data = rejected_data.loc[rejected_data['contrib_gender'].notna()]

# drop the columns that are not needed
rejected_data = rejected_data.drop(['ownername', 'reponame', 'id', 'project_id', 'github_id', 'creator_id'], axis=1)

In [6]:
# use a label encoder to encode gender as 0 or 1
le = LabelEncoder()
le.fit(rejected_data['contrib_gender'])
rejected_data['contrib_gender'] = le.transform(rejected_data['contrib_gender'])

# sample the data to have equal number
rejected_data_0 = rejected_data[rejected_data['contrib_gender'] == 0].sample(800)
rejected_data_1 = rejected_data[rejected_data['contrib_gender'] == 1].sample(800)

# concatenate the two dataframes
rejected_data = pd.concat([rejected_data_0, rejected_data_1])

# shuffle the data
rejected_data = rejected_data.sample(frac=1).reset_index(drop=True)

# get the x and y, x is the code, y is the gender
X = rejected_data['added_code']
Y = rejected_data['contrib_gender']

In [41]:
# prepare the tokenizer and model for codebert
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
codebert_model = AutoModel.from_pretrained("microsoft/codebert-base")

In [42]:
# get the embeddings for each code
X_embeddings = []
for code in tqdm(X):
    inputs = tokenizer(code, return_tensors="pt", truncation=True, padding=True)
    outputs = codebert_model(**inputs) # inputs = input_ids, attention_mask
    X_embeddings.append(outputs.last_hidden_state[:, 0, :].detach().numpy())

100%|██████████| 1600/1600 [03:47<00:00,  7.02it/s]


In [43]:
# reshape the embeddings
X_embeddings = np.array(X_embeddings)
X_embeddings = X_embeddings.reshape(-1, codebert_model.config.hidden_size)

In [44]:
# split into train test val split
train_ratio = 0.70
test_ratio = 0.20
val_ratio = 0.10

X_train, X_test, y_train, y_test = train_test_split(X_embeddings, Y, test_size=1-train_ratio)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(val_ratio+test_ratio))

In [45]:
# convert to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [46]:
# reshape the data
X_train = X_train.reshape(-1, codebert_model.config.hidden_size)
X_val = X_val.reshape(-1, codebert_model.config.hidden_size)
X_test = X_test.reshape(-1, codebert_model.config.hidden_size)

In [47]:
# create a neural network model that takens in the code embeddings
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [50]:
# create the model
model = Net(input_size=codebert_model.config.hidden_size, hidden_size=1000, num_classes=2)

# create the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# train the model
for epoch in range(20):
    for i, (codes, labels) in enumerate(zip(X_train, y_train)):
        # get the code embeddings and labels
        codes = torch.tensor(codes).unsqueeze(0)
        labels = torch.tensor([labels]).long()
        
        # run gradient descent on the model
        outputs = model(codes)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # evaluate the model on the validation set
    correct = 0
    total = 0
    with torch.no_grad():
        for i, (codes, labels) in enumerate(zip(X_test, y_test)):
            
            codes = torch.tensor(codes).unsqueeze(0)
            labels = torch.tensor([labels]).long()
            
            outputs = model(codes)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    # print the accuracy
    accuracy = 100 * correct / total
    print ('Epoch [{}/{}], Train Loss: {:.4f}, Test Accuracy: {:.2f}%'.format(epoch+1, 100, loss.item(), accuracy))

Epoch [1/100], Train Loss: 0.4756, Test Accuracy: 51.09%
Epoch [2/100], Train Loss: 0.5119, Test Accuracy: 51.09%
Epoch [3/100], Train Loss: 0.5110, Test Accuracy: 51.09%
Epoch [4/100], Train Loss: 0.5107, Test Accuracy: 51.09%
Epoch [5/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [6/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [7/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [8/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [9/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [10/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [11/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [12/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [13/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [14/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [15/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [16/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [17/100], Train Loss: 0.5106, Test Accuracy: 51.09%
Epoch [18/100], Train L

In [51]:
# evaluate the model on the validation set
y_true = []
y_pred = []
with torch.no_grad():
    correct = 0
    total = 0
    for i, (codes, labels) in enumerate(zip(X_val, y_val)):
        codes = torch.tensor(codes).unsqueeze(0)
        labels = torch.tensor([labels]).long()
        
        outputs = model(codes)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        y_true.extend(labels.tolist())
        y_pred.extend(predicted.tolist())

    accuracy = 100 * correct / total
    print ('Validation Accuracy: {:.2f}%'.format(accuracy))

Validation Accuracy: 48.12%


In [52]:
# evaluate the model on the test set
from sklearn.metrics import precision_recall_fscore_support
precision, recall, f1_score, _ = precision_recall_fscore_support(np.array(y_true), np.array(y_pred))
print(precision)
print(recall)
print(f1_score)

[0.      0.48125]
[0. 1.]
[0.         0.64978903]


  _warn_prf(average, modifier, msg_start, len(result))
