In [1]:
import pandas as pd
import json

# Load the JSON data from the file
with open('issues.json', 'r') as f:
    data = json.load(f)

# Convert the loaded data into a pandas DataFrame
df = pd.DataFrame(data)
df['created_at'] = pd.to_datetime(df['created_at'])
df['closed_at'] = pd.to_datetime(df['closed_at'])

# Calculate the time to close by subtracting 'created_at' from 'closed_at'
df['time_to_close'] = df['closed_at'] - df['created_at']
df['time_to_close_hours'] = df['time_to_close'].dt.total_seconds() / 3600
# drop created_at, closed_at, time_to_close
df.drop(columns=['time_to_close'], inplace=True)

In [2]:
df.head()

Unnamed: 0,title,body,created_at,closed_at,is_pull_request,author_association,time_to_close_hours
0,[DOM] Fix package.json files for #28784,Missed some files for the react-server disallo...,2024-04-08 22:41:51+00:00,2024-04-08 22:49:19+00:00,True,COLLABORATOR,0.124444
1,[DOM] disallow client entrypoints with react-s...,`react-server` precludes loading code that exp...,2024-04-08 22:26:02+00:00,2024-04-08 22:37:06+00:00,True,COLLABORATOR,0.184444
2,[TestUtils] Build limited test-utils,We landed a flag to disable test utils in many...,2024-04-08 18:02:46+00:00,2024-04-08 19:27:20+00:00,True,COLLABORATOR,1.409444
3,[Flight] Allow lazily resolving outlined models,We used to assume that outlined models are emi...,2024-04-08 15:24:01+00:00,2024-04-08 19:40:11+00:00,True,COLLABORATOR,4.269444
4,Add Promise as a child test to Flight fixture,Adds a test for promise as a child that was fi...,2024-04-08 10:46:31+00:00,2024-04-08 15:06:17+00:00,True,COLLABORATOR,4.329444


In [3]:
import nltk
import pandas as pd
import string
import contractions
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fill NaN values with empty string before combining
df['title'].fillna('', inplace=True)
df['body'].fillna('', inplace=True)
df['text'] = df['title'] + " " + df['body']

# Step 1: Replace line breaks and quotation marks
df['text_parsed'] = df['text'].str.replace("\r", " ")
df['text_parsed'] = df['text_parsed'].str.replace("\n", " ")
df['text_parsed'] = df['text_parsed'].str.replace('"', '')
df['text_parsed'] = df['text_parsed'].str.lower()

# Step 2: Expand Contractions
df['text_parsed'] = df['text_parsed'].apply(lambda x: contractions.fix(x))

# Step 3: Remove punctuation and possessive pronoun terminations
punctuation_signs = string.punctuation
df['text_parsed'] = df['text_parsed'].apply(lambda x: ''.join([char for char in x if char not in punctuation_signs]))
df['text_parsed'] = df['text_parsed'].str.replace("'s", "", regex=True)

# Step 4: Lemmatize text
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    text_words = nltk.word_tokenize(text)
    lemmatized_list = [wordnet_lemmatizer.lemmatize(word, pos="v") for word in text_words]
    lemmatized_text = " ".join(lemmatized_list)
    return lemmatized_text

df['text_parsed'] = df['text_parsed'].apply(lambda x: lemmatize_text(x))

# Remove stop words
stop_words = set(nltk.corpus.stopwords.words('english'))
df['text_parsed'] = df['text_parsed'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [4]:
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

input_ids = []
attention_masks = []

for text in df['text_parsed']:
    encoded_dict = tokenizer.encode_plus(
        text,                      
        add_special_tokens=True,   
        max_length=64,           
        pad_to_max_length=True,    
        return_attention_mask=True,
        return_tensors='pt',      
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Set the batch size.  
batch_size = 8

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# Prediction on test set
print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions = []

# Predict 
for batch in prediction_dataloader:
  # Add batch to CPU
  batch = tuple(t for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  # Get the "pooled" output for each batch from BERT, which is a more 
  # fixed-sized representation for the whole sentence, and can be used for classification tasks.
  pooled_output = outputs[1]

  # Move logits and labels to CPU
  predictions.append(pooled_output)

print('    DONE.')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Predicting labels for 26,022 test sentences...
    DONE.


In [5]:
import pandas as pd
import numpy as np

# Assuming 'embeddings' is a list of pooled BERT outputs
embeddings = np.vstack(predictions)  # Stacking the list of tensors into a single numpy array

# Convert BERT embeddings to a DataFrame
bert_features = pd.DataFrame(embeddings)

# Assign column names to the BERT feature columns
bert_features.columns = [f'bert_{i}' for i in range(bert_features.shape[1])]

# Concatenate the BERT embeddings DataFrame with your original DataFrame
# Make sure the indices are aligned before concatenation
df_bert = pd.concat([df.reset_index(drop=True), bert_features.reset_index(drop=True)], axis=1)

# Encode the 'author_association' column with dummy variables
author_association_dummies = pd.get_dummies(df_bert['author_association'], prefix='author')
df_bert = pd.concat([df_bert, author_association_dummies], axis=1)

# Convert 'created_at' to datetime if it's not already
df_bert['created_at'] = pd.to_datetime(df_bert['created_at'])

# Ensure the data is sorted chronologically based on 'created_at'
df_bert = df_bert.sort_values('created_at')

# Split your data chronologically into train and test sets
# Let's say 80% for training and 20% for testing as an example
split_point = int(len(df_bert) * 0.8)
train_data = df_bert.iloc[:split_point, :]
test_data = df_bert.iloc[split_point:, :]

# Define the columns to drop (columns not used as features for training)
columns_to_drop = ['created_at', 'closed_at', 'title', 'body', 'author_association', 'text', 'text_parsed']

# Drop the unnecessary columns and split the data into features and target
X_train = train_data.drop(columns=columns_to_drop + ['time_to_close_hours'], axis=1)
y_train = train_data['time_to_close_hours']

X_test = test_data.drop(columns=columns_to_drop + ['time_to_close_hours'], axis=1)
y_test = test_data['time_to_close_hours']

# Handle any NaNs in target variable 'time_to_close' if needed
X_train = X_train[y_train.notnull()]
y_train = y_train[y_train.notnull()]

X_test = X_test[y_test.notnull()]
y_test = y_test[y_test.notnull()]

# Now, X_train, y_train, X_test, and y_test are ready for model training and evaluation

In [6]:
# Convert 'is_pull_request' from True/False to 0/1
X_train['is_pull_request'] = X_train['is_pull_request'].astype(int)
X_test['is_pull_request'] = X_test['is_pull_request'].astype(int)

author_columns = ['author_COLLABORATOR', 'author_CONTRIBUTOR', 'author_MEMBER', 'author_NONE']

# Convert each author_* column to numeric
for col in author_columns:
    X_train[col] = X_train[col].astype(int)
    X_test[col] = X_test[col].astype(int)

In [7]:
X_train.head()

Unnamed: 0,is_pull_request,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,...,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767,author_COLLABORATOR,author_CONTRIBUTOR,author_MEMBER,author_NONE
26021,1,-0.61071,-0.262707,-0.652588,0.281311,0.32559,-0.089598,0.310309,0.143253,-0.324933,...,-0.029425,0.183692,0.456515,-0.664369,-0.498581,0.64188,0,1,0,0
26020,1,-0.80995,-0.488104,-0.730921,0.690221,0.532908,-0.303889,0.700155,0.325999,-0.504637,...,0.064088,0.227013,0.643613,-0.576652,-0.761884,0.874527,0,1,0,0
26019,1,-0.839109,-0.572211,-0.866289,0.740512,0.602457,-0.244243,0.876909,0.322928,-0.505415,...,0.054007,0.182228,0.560197,-0.759059,-0.711372,0.879891,0,1,0,0
26018,1,-0.768151,-0.280106,-0.744873,0.470827,0.520267,-0.238235,0.469454,0.115835,-0.380922,...,-0.459416,0.149,0.508871,-0.821528,-0.568023,0.777994,0,1,0,0
26017,1,-0.848433,-0.353808,-0.355003,0.588401,0.336235,-0.249113,0.776377,0.229713,-0.148931,...,0.164339,0.444366,0.480367,-0.245089,-0.68228,0.90837,0,1,0,0


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

# Initialize the Random Forest Regressor
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
y_train_log = np.log1p(y_train)

# Fit the model on the training data
rf_reg.fit(X_train, y_train_log)

# Predict on the training and testing sets
train_predictions_rf = np.expm1(rf_reg.predict(X_train))
test_predictions_rf = np.expm1(rf_reg.predict(X_test))

# Calculate RMSE for both the training and testing sets
train_rmse_rf = sqrt(mean_squared_error(y_train, train_predictions_rf))
test_rmse_rf = sqrt(mean_squared_error(y_test, test_predictions_rf))

print(f"Random Forest Training RMSE: {train_rmse_rf:.2f}")
print(f"Random Forest Test RMSE: {test_rmse_rf:.2f}")

Random Forest Training RMSE: 4024.56
Random Forest Test RMSE: 1637.83


In [10]:
from joblib import dump

model_path = "random_forest_model.joblib"
dump(rf_reg, model_path)

['random_forest_model.joblib']