In [1]:
# the trained model and the vectorizer it uses are pickled for ease of loading
import pickle


In [2]:
# pandas is used for reading the input file, writing the output, and easy manipulation of columns
import pandas as pd

In [3]:
import sys
sys.path.insert(1, '../common')
# utility modules for preprocessing of text
import nlp_utils as nu


In [4]:
# Loads the model and the vectorizer from pickled files and also gets the
# preprocessor; all three are needed for inference on the test data.
def get_model(vectorizer_path='./TFIDF-2vectorizer.pkl', model_path='./SVM_TFIDF-2.pkl'):
    """Collect the objects needed to run inference.

    Args:
        vectorizer_path: Path of the pickled vectorizer. Defaults to the
            file name this notebook has always used.
        model_path: Path of the pickled model. Defaults to the file name
            this notebook has always used.

    Returns:
        dict with three keys: 'preprocessor' (the ``nu.process_text``
        callable), 'vectorizer' and 'model' (the unpickled objects).

    Raises:
        OSError: if either pickle file cannot be opened.
    """
    sys.stdout.flush()
    preprocessor = nu.process_text

    # SECURITY NOTE: pickle.load can execute arbitrary code while loading, so
    # these paths must only ever point at trusted files.
    # The `with` blocks make sure each file handle is closed after loading.
    with open(vectorizer_path, 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    return {
        'preprocessor': preprocessor,
        'vectorizer': vectorizer,
        'model': model,
    }

In [6]:
# Read in the validation data.
# Forward slashes keep this relative path portable (they work on Windows and
# POSIX alike) and avoid backslash escape sequences inside the string literal.
df = pd.read_csv('../../Gaussian Solutions Input Data/AI_ML_Challenge_Validation_Data_Set_v1.csv', encoding='utf-8')

In [7]:
# Rename the 'Clause Text' column to 'Clause', which is the column name the
# rest of this notebook reads the clause text from.
df = df.rename({'Clause Text': 'Clause'}, axis='columns')

In [8]:
# Preview the first rows of the loaded data frame as a quick sanity check.
df.head()

Unnamed: 0,Clause ID,Clause
0,94,\tthe Customer does not make any admissions (s...
1,7028,Requests. Company will notify Customer before ...
2,9048,We sometimes release beta versions of our webs...
3,7755,Termination without Cause. Customer may termin...
4,1145,1.8 “Term” means the term of this Agreement as...


In [9]:
# Load the preprocessor, vectorizer and model into a single dict.
model_dict = get_model()

In [10]:
# Pull the three inference components out of the dict under their own names.
vectorizer, model, preprocessor = (
    model_dict['vectorizer'],
    model_dict['model'],
    model_dict['preprocessor'],
)

In [11]:
# Preprocess the data: run every clause through the preprocessor and coerce
# each result to str. The preprocessor is intended to eliminate punctuation,
# stop words, weird characters and single-letter tokens (its implementation
# lives in nlp_utils, outside this notebook).
print("Preprocessing data")
X_prep = list(map(str, map(preprocessor, df['Clause'])))

Preprocessing data


In [12]:
# Convert the preprocessed clause strings into their vectorized representation
# using the loaded vectorizer (transform only; nothing is fitted here).
print("Vectorizing data")
X_test = vectorizer.transform(X_prep)

Vecotirizng data


In [13]:
# The feature matrix is ready, so predict a label for every clause.
y_pred = model.predict(X_test)

In [14]:
# Score each clause for label = 1. predict_proba is tried first and column 1
# of its result is kept (presumably the label-1 class — confirm against
# model.classes_). If that raises AttributeError, fall back to
# decision_function instead.
# NOTE(review): decision_function scores are generally not probabilities and
# need not lie in [0, 1] — confirm that a later `1 - y_prob` is still
# meaningful whenever this fallback is taken.
try:
    y_prob = model.predict_proba(X_test)[:,1]
except AttributeError:
    y_prob = model.decision_function(X_test)

In [14]:
# Store the predicted labels in the data frame as the 'Prediction' column.
df['Prediction'] = y_pred

In [15]:
# Store the probability of a clause being acceptable in the data frame.
# y_prob holds the model's score for label = 1, which is taken here to mean
# rejection, so it is subtracted from 1 to get the probability of acceptance.
df['Probability Acceptable'] = 1-y_prob

In [16]:
# Build the frame that will be written out: the submission asked for the
# clause text to be left out, so drop the 'Clause' column (the renamed
# 'Clause Text' column).
csv_df = df.drop('Clause', axis='columns')

In [17]:
# Preview the first rows of the frame that is about to be exported.
csv_df.head()

Unnamed: 0,Clause ID,Probability Acceptable
0,94,0.750429
1,7028,0.68154
2,9048,0.666098
3,7755,0.267153
4,1145,0.93706


In [18]:
# Write the submission CSV without the data frame index.
# The file name ends in a plain '.csv' extension (no space before 'csv') so
# that tools recognise the output as a CSV file.
csv_df.to_csv('Gaussian Solutions Validation Data File.csv', index=False)