# DO NOT RERUN: TAKES A LONG TIME

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the Hugging Face `transformers` package into the runtime.
# NOTE(review): the version is not pinned, so a re-run may install a release
# whose API differs from the one this notebook was written against — consider pinning.
!pip install transformers



In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [0]:
# Load the project CSV from the mounted Drive folder (path is relative to the working directory).
df=pd.read_csv("drive/My Drive/cs109bproject/df_project_unique.csv")

In [0]:
# Drop the 'Unnamed: 0' column (presumably an index column saved with the CSV).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# NLP Prediction of Budget and Schedule Changes 


## a) Preparing Our Dataset

### Creation of the input X
The input will be the concatenation of "Project Name", "Category", "Borough", and "Description".

In [0]:
# Target columns:
#   'Total Budget Changes (Scaled to Original)'
#   'Total Schedule Changes (Scaled to Original)'
# A missing change ratio is treated as "no change" (0).
for change_col in ('Total Budget Changes (Scaled to Original)',
                   'Total Schedule Changes (Scaled to Original)'):
    df[change_col] = df[change_col].fillna(0)

# Missing text fields become empty strings so the concatenation below never yields NaN.
for text_col in ("Project Name", "Category", "Borough", "Description"):
    df[text_col] = df[text_col].fillna('')

# Single free-text input: "<name>, <category> in <borough>: <description>"
df["text"] = (
    df["Project Name"] + ', ' + df["Category"]
    + ' in ' + df["Borough"] + ': ' + df["Description"]
)

In [7]:
# Peek at one randomly chosen concatenated text. Drawing the position from the
# actual row count (instead of a hard-coded 100) lets any row be picked and
# avoids an IndexError when the frame has fewer than 100 rows.
df["text"].iloc[np.random.randint(len(df))]

'Brunswick Avenue Sewers and Watermain Construction, Sewers in Queens: Storm and Sanitary sewers in Brunswick Ave. b/t Doughty Blvd. and Nameoke Ave, etc.Queens.  The storm sewer are needed to alleyiate ponding condition in the area and provide resisdents with adequate storm sewer.  New sanitary sewers have been included to service adjoining houses. There is water main work assciated with server project.'

### Creation of the labels Y

For both budget and schedule changes, we split projects into 2 categories:
- "Not Late/Reasonably Late" or "Too Late": schedule-change threshold set at 10%
- "Under Budget/Reasonably Over Budget" or "Over Budget": budget-change threshold set at 5%

In [8]:
# Remove rows whose scaled budget change is +inf. Dropping an empty index is a
# no-op, so the cell is safe to re-run and no longer needs a bare `except`
# (which also hid unrelated errors, e.g. a missing column, behind "Already done").
inf_budget_rows = df.index[df["Total Budget Changes (Scaled to Original)"] == np.inf]
df = df.drop(index=inf_budget_rows)
# Summary statistics of the two target columns after the clean-up.
df[["Total Budget Changes (Scaled to Original)",'Total Schedule Changes (Scaled to Original)']].describe()

Unnamed: 0,Total Budget Changes (Scaled to Original),Total Schedule Changes (Scaled to Original)
count,313.0,313.0
mean,0.621585,-0.116365
std,4.365947,4.430884
min,-1.0,-64.913043
25%,-0.000645,0.0
50%,0.026811,0.109594
75%,0.324352,0.378151
max,59.592113,2.886424


In [0]:
def create_label(x,level):
    """Binarise a change ratio: 0 when `x` is at or below `level`, otherwise 1."""
    return 0 if x<=level else 1

# Thresholds are arbitrary choices:
#   schedule: a change of up to 10% could still be acceptable
#   budget:   a change of up to 5% is acceptable
df['labels_Schedule'] = df["Total Schedule Changes (Scaled to Original)"].apply(create_label, args=(0.1,))
df['labels_Budget'] = df["Total Budget Changes (Scaled to Original)"].apply(create_label, args=(0.05,))


## b) Preprocessing Dataset Using Tokenisers

Set max_length to 240, since no input is longer than 240 tokens. We are constrained by BERT's maximum sequence length of 512, and we do not want inputs padded to be longer than necessary.

In [0]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification,BertTokenizer
#from transformers import XLNetTokenizer, TFXLNetForSequenceClassification

# Tokenizer loaded from the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# bert-base-uncased: 12-layer, 768-hidden, 12-heads, 110M parameters.
# Trained on lower-cased English text.

In [0]:
# Token length passed to the tokenizer as `max_length` in convert_text_to_feature.
max_length = 240
# Number of examples per batch in the tf.data pipelines.
batch_size=6

### Convert text to Tokens

In [0]:
def convert_text_to_feature(text):
  """Encode one text with the module-level `tokenizer`.

  A single `encode_plus` call handles tokenization, insertion of the special
  tokens, padding up to `max_length`, and creation of the attention mask.
  NOTE(review): whether over-long texts are truncated depends on the installed
  transformers version, since no truncation argument is passed — confirm.
  """
  encoding_options = dict(
      add_special_tokens=True,     # add [CLS], [SEP]
      max_length=max_length,       # max length of the text that can go to BERT
      pad_to_max_length=True,      # add [PAD] tokens
      return_attention_mask=True,  # add attention mask to not focus on pad tokens
  )
  return tokenizer.encode_plus(text, **encoding_options)
  
  # map to the expected input to TFBertForSequenceClassification, see here 
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Pack the three encoded inputs into a keyword dict and pair it with the label.

  Returns a `(features, label)` tuple where `features` has the keys
  "input_ids", "token_type_ids" and "attention_mask".
  """
  features = {}
  features["input_ids"] = input_ids
  features["token_type_ids"] = token_type_ids
  features["attention_mask"] = attention_masks
  return features, label

def encode_text(df,column_name='labels_Schedule', limit=-1):
  """Tokenise `df.text` and pair every example with its label as a tf.data.Dataset.

  Args:
    df: DataFrame with a `text` column and the label column `column_name`.
    column_name: name of the column used as the target label.
    limit: when > 0, only the first `limit` rows are encoded; any other value
      (default -1) encodes every row.

  Returns:
    An unbatched `tf.data.Dataset` whose elements are the `(features, label)`
    pairs produced by `map_example_to_dict`; each label is wrapped in a
    one-element list.
  """
  # Apply the limit to the DataFrame itself, before any tokenisation work is done.
  # (The previous code called `.take()` on an undefined `ds`, so any positive
  # limit raised UnboundLocalError.)
  if limit > 0:
    df = df.head(limit)

  # Build plain Python lists so the final dataset can be created from slices.
  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  label_list = []

  for text, label in zip(df.text, df[column_name]):
    bert_input = convert_text_to_feature(text)

    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([label])

  # Tuple order must match the positional parameters of map_example_to_dict.
  dataset = tf.data.Dataset.from_tensor_slices(
      (input_ids_list, attention_mask_list, token_type_ids_list, label_list))
  return dataset.map(map_example_to_dict)


### Split Train and Test

In [0]:
# Hold out 10% of the rows; the fixed seed makes the split reproducible across runs.
df_train,df_test=train_test_split(df,test_size=0.1,random_state=42)

# train dataset: encoded, shuffled, then batched
ds_train_encoded = encode_text(df_train,column_name='labels_Schedule').shuffle(10000).batch(batch_size)

# test dataset: same pipeline, used as validation data during fitting
ds_test_encoded = encode_text(df_test,column_name='labels_Schedule').shuffle(10000).batch(batch_size)


## c) Fine-Tuning of our model

In [0]:
# Initialize the model from the pretrained 'bert-base-uncased' weights.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Recommended learning rates for Adam: 5e-5, 3e-5, 2e-5.
learning_rate = 2e-5

# Train for 3 epochs; more epochs might help as long as the model does not overfit.
number_of_epochs = 3


# Adam is the recommended optimizer for the classifier.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Labels are integer class ids rather than one-hot vectors, so use sparse
# categorical cross-entropy and accuracy; from_logits=True tells the loss to
# treat the model outputs as raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [15]:
# Fine-tune on the training batches, using the held-out batches as validation data.
bert_history = model.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_test_encoded) # run with learning rate 2e-5

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [40]:
# Save the fine-tuned model to the local directory 'Bert_labels_Schedule'.
model.save('Bert_labels_Schedule')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: Bert_labels_Schedule/assets


In [0]:
#from tensorflow import keras
#model1 = keras.models.load_model('Bert_labels_Schedule')

In [0]:
# Archive the saved model directory recursively into Bert_labels_Schedule.zip.
!zip -r Bert_labels_Schedule.zip Bert_labels_Schedule

## d) Results

In [24]:
# Encode every row of `df` without shuffling, so predictions stay aligned with
# df['labels_Schedule'].
# NOTE(review): `df` still contains the rows that went into df_train, so any
# metric computed from `preds` is not a held-out estimate.
df_encoded=encode_text(df,column_name='labels_Schedule').batch(batch_size)
preds=model.predict(df_encoded,verbose=1)



In [0]:
def softmax(x):
    """Apply a numerically stable softmax to each row of `x`.

    Every row is shifted by its own maximum before exponentiation, then
    normalised to sum to 1. The rows are returned stacked in a NumPy array.
    """
    def _normalise(row):
        exps = np.exp(row - np.max(row))
        return (exps / exps.sum()).tolist()

    return np.array([_normalise(row) for row in x])

# Convert the first element of `preds` into per-row class probabilities.
# NOTE(review): presumably preds[0] is the logits array returned by
# model.predict — confirm for the installed transformers version.
scores=softmax(preds[0])

In [0]:
# Predicted class = index of the highest probability in each row.
# NOTE(review): rebinding `scores` makes this cell non-idempotent — a second
# run would call argmax(axis=1) on the already-reduced array.
scores=scores.argmax(axis=1)


In [0]:
from sklearn.metrics import confusion_matrix

In [39]:
# First argument is the true labels, second the predictions: rows of the
# matrix are true classes and columns are predicted classes.
confusion_matrix(df['labels_Schedule'].values,scores)

array([[127,  26],
       [ 20, 140]])