In [20]:
import pandas as pd
import json

# Load the JSON data from the file
with open('issues.json', 'r') as f:
    data = json.load(f)

# Convert the loaded data into a pandas DataFrame
df = pd.DataFrame(data)
df['created_at'] = pd.to_datetime(df['created_at'])
df['closed_at'] = pd.to_datetime(df['closed_at'])

# Calculate the time to close by subtracting 'created_at' from 'closed_at'
df['time_to_close'] = df['closed_at'] - df['created_at']
df['time_to_close_hours'] = df['time_to_close'].dt.total_seconds() / 3600
# drop created_at, closed_at, time_to_close
df.drop(columns=['time_to_close'], inplace=True)

In [21]:
import nltk
import pandas as pd
import string
import contractions
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fill NaN values with empty string before combining
df['title'].fillna('', inplace=True)
df['body'].fillna('', inplace=True)
df['text'] = df['title'] + " " + df['body']

# Step 1: Replace line breaks and quotation marks
df['text_parsed'] = df['text'].str.replace("\r", " ")
df['text_parsed'] = df['text_parsed'].str.replace("\n", " ")
df['text_parsed'] = df['text_parsed'].str.replace('"', '')
df['text_parsed'] = df['text_parsed'].str.lower()

# Step 2: Expand Contractions
df['text_parsed'] = df['text_parsed'].apply(lambda x: contractions.fix(x))

# Step 3: Remove punctuation and possessive pronoun terminations
punctuation_signs = string.punctuation
df['text_parsed'] = df['text_parsed'].apply(lambda x: ''.join([char for char in x if char not in punctuation_signs]))
df['text_parsed'] = df['text_parsed'].str.replace("'s", "", regex=True)

# Step 4: Lemmatize text
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    text_words = nltk.word_tokenize(text)
    lemmatized_list = [wordnet_lemmatizer.lemmatize(word, pos="v") for word in text_words]
    lemmatized_text = " ".join(lemmatized_list)
    return lemmatized_text

df['text_parsed'] = df['text_parsed'].apply(lambda x: lemmatize_text(x))

# Remove stop words
stop_words = set(nltk.corpus.stopwords.words('english'))
df['text_parsed'] = df['text_parsed'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['title'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['body'].fillna('', inplace=True)
[nltk_data] Downloading package punkt to /Users/wenxiyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopw

In [22]:
df.head()

Unnamed: 0,title,body,created_at,closed_at,is_pull_request,author_association,time_to_close_hours,text,text_parsed
0,[DOM] Fix package.json files for #28784,Missed some files for the react-server disallo...,2024-04-08 22:41:51+00:00,2024-04-08 22:49:19+00:00,True,COLLABORATOR,0.124444,[DOM] Fix package.json files for #28784 Missed...,dom fix packagejson file 28784 miss file react...
1,[DOM] disallow client entrypoints with react-s...,`react-server` precludes loading code that exp...,2024-04-08 22:26:02+00:00,2024-04-08 22:37:06+00:00,True,COLLABORATOR,0.184444,[DOM] disallow client entrypoints with react-s...,dom disallow client entrypoints reactserver co...
2,[TestUtils] Build limited test-utils,We landed a flag to disable test utils in many...,2024-04-08 18:02:46+00:00,2024-04-08 19:27:20+00:00,True,COLLABORATOR,1.409444,[TestUtils] Build limited test-utils We landed...,testutils build limit testutils land flag disa...
3,[Flight] Allow lazily resolving outlined models,We used to assume that outlined models are emi...,2024-04-08 15:24:01+00:00,2024-04-08 19:40:11+00:00,True,COLLABORATOR,4.269444,[Flight] Allow lazily resolving outlined model...,flight allow lazily resolve outline model use ...
4,Add Promise as a child test to Flight fixture,Adds a test for promise as a child that was fi...,2024-04-08 10:46:31+00:00,2024-04-08 15:06:17+00:00,True,COLLABORATOR,4.329444,Add Promise as a child test to Flight fixture ...,add promise child test flight fixture add test...


In [23]:
author_association_dummies = pd.get_dummies(df['author_association'], prefix='author')
df = pd.concat([df, author_association_dummies], axis=1)

df['created_at'] = pd.to_datetime(df['created_at'])

# Ensure the data is sorted chronologically based on 'created_at'
df = df.sort_values('created_at')
df = df.reset_index(drop=True)

# Split your data chronologically into train and test sets
# Let's say 80% for training and 20% for testing as an example
split_point = int(len(df) * 0.8)
train_data = df.iloc[:split_point, :]
test_data = df.iloc[split_point:, :]

# Define the columns to drop (columns not used as features for training)
columns_to_drop = ['created_at', 'closed_at', 'title', 'body', 'author_association', 'text']

# Drop the unnecessary columns and split the data into features and target
X_train = train_data.drop(columns=columns_to_drop , axis=1)
y_train = train_data['time_to_close_hours']

X_test = test_data.drop(columns=columns_to_drop, axis=1)
y_test = test_data['time_to_close_hours']

# Handle any NaNs in target variable 'time_to_close' if needed
X_train = X_train[y_train.notnull()]
y_train = y_train[y_train.notnull()]

X_test = X_test[y_test.notnull()]
y_test = y_test[y_test.notnull()]

X_train['is_pull_request'] = X_train['is_pull_request'].astype(int)
X_test['is_pull_request'] = X_test['is_pull_request'].astype(int)

author_columns = ['author_COLLABORATOR', 'author_CONTRIBUTOR', 'author_MEMBER', 'author_NONE']

# Convert each author_* column to numeric
for col in author_columns:
    X_train[col] = X_train[col].astype(int)
    X_test[col] = X_test[col].astype(int)

In [24]:
X_train.head()

Unnamed: 0,is_pull_request,time_to_close_hours,text_parsed,author_COLLABORATOR,author_CONTRIBUTOR,author_MEMBER,author_NONE
0,1,117.619167,run test iframe block initial launch feel free...,0,1,0,0
1,1,0.012778,docs fix button link bottom home button index ...,0,1,0,0
2,1,0.033333,docs fix couple minor typosspelling,0,1,0,0
3,1,0.004722,docs improve event handle documentation add ad...,0,1,0,0
4,1,0.014444,fix link root readmemd,0,1,0,0


In [25]:
from torch import nn
from transformers import BertModel
import torch

class BertRegressorWithFeatures(nn.Module):
    def __init__(self, additional_feature_size):
        super(BertRegressorWithFeatures, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Assuming the BERT output size and additional features. Adjust `additional_feature_size` accordingly.
        self.regressor = nn.Linear(self.bert.config.hidden_size + additional_feature_size, 1)

    def forward(self, input_ids, attention_mask, additional_features):
        # Get the pooled output from BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        
        # Concatenate the BERT output with the additional features
        combined_features = torch.cat((pooled_output, additional_features), dim=1)
        
        # Pass the combined features through the regressor for the final prediction
        return self.regressor(combined_features)

In [26]:
train_additional_features = X_train[['is_pull_request', 'author_COLLABORATOR', 'author_CONTRIBUTOR', 'author_MEMBER', 'author_NONE']].to_numpy()
test_additional_features = X_test[['is_pull_request', 'author_COLLABORATOR', 'author_CONTRIBUTOR', 'author_MEMBER', 'author_NONE']].to_numpy()
labels = df['time_to_close_hours'].to_numpy()

In [27]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize training and testing text data
train_encodings = tokenizer(X_train['text_parsed'].tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(X_test['text_parsed'].tolist(), truncation=True, padding=True, max_length=512)


In [28]:
from torch.utils.data import Dataset, DataLoader
import torch

class GitHubIssuesDataset(Dataset):
    def __init__(self, encodings, additional_features, labels):
        self.encodings = encodings
        self.additional_features = additional_features
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['additional_features'] = torch.tensor(self.additional_features[idx], dtype=torch.float)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = GitHubIssuesDataset(train_encodings, train_additional_features, y_train)
test_dataset = GitHubIssuesDataset(test_encodings, test_additional_features, y_test)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [29]:
from transformers import AdamW
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertRegressorWithFeatures(additional_feature_size=5).to(device)  # Update `additional_feature_size` as necessary
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        additional_features = batch['additional_features'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask, additional_features)
        loss = nn.MSELoss()(outputs, labels.unsqueeze(-1))  # Ensure labels are correctly shaped
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()




In [None]:
from torch.nn.functional import mse_loss
from math import sqrt

def calculate_rmse(model, data_loader, device):
    model.eval()  # Set the model to evaluation mode
    total_mse = 0
    with torch.no_grad():  # No gradients needed
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            additional_features = batch['additional_features'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask, additional_features)
            
            # Assuming your model returns the predictions directly
            # You may need to adjust this if your model output structure is different
            mse = mse_loss(outputs.squeeze(), labels)  # Ensure the shapes align
            total_mse += mse.item() * len(labels)  # Accumulate the total MSE

    # Calculate mean MSE then RMSE
    mean_mse = total_mse / len(data_loader.dataset)
    rmse = sqrt(mean_mse)
    return rmse

In [None]:
train_rmse = calculate_rmse(model, train_loader, device)
test_rmse = calculate_rmse(model, test_loader, device)

print(f"Training RMSE: {train_rmse}")
print(f"Testing RMSE: {test_rmse}")

In [None]:
torch.save(model.state_dict(), 'model_time_to_close.pth')