In [5]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder

In [6]:
#Loading data from local storage
df = pd.read_csv(('data/text.csv'), index_col=False).sample(frac=1)
df.head(5)

Unnamed: 0,sentence,label
154,""" If Google do not secure the content writer s...",Risk
190,""" If customer is forced to keep Microsoft out ...",Risk
114,""" If Google deploy the project late due to our...",Risk
138,""" If they is forced to keep Microsoft out of t...",Risk
67,""" Drew White decide to include an additional s...",Decision


In [7]:
#Splitting data into train and test data
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['sentence'],df['label'],test_size=0.3)

In [8]:
#Encoding the labels Decision, Action and Risk as 0, 1 and 2 numeric values respectively. 
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
print(Train_Y[0:5])

[0 0 2 2 0]


In [9]:
#The Tfidf_vectorizer converts the dataset of sentences into vectorized inputs.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['sentence'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
print(Tfidf_vect.vocabulary_)

{'if': 98, 'google': 86, 'do': 57, 'not': 146, 'secure': 179, 'the': 203, 'content': 40, 'writer': 227, 'spot': 188, 'during': 62, 'event': 71, 'due': 61, 'to': 209, 'lack': 114, 'of': 147, 'commitment': 34, 'from': 83, 'board': 22, 'then': 204, 'might': 130, 'need': 143, 'pay': 158, 'customers': 45, 'customer': 44, 'is': 107, 'forced': 80, 'keep': 113, 'microsoft': 129, 'out': 153, 'ai': 9, 'ecosystem': 64, 'our': 152, 'limited': 117, 'resources': 174, 'may': 126, 'loose': 120, 'supportive': 195, 'advantage': 6, 'deploy': 55, 'project': 169, 'late': 115, 'perfectionism': 159, 'could': 43, 'they': 206, 'hiring': 96, 'process': 167, 'can': 26, 'drew': 60, 'white': 220, 'decide': 49, 'include': 104, 'an': 10, 'additional': 5, 'set': 180, 'spare': 186, 'parts': 157, 'next': 145, 'system': 197, 'test': 201, 'we': 217, 'fear': 74, 'potential': 164, 'loss': 121, 'face': 72, 'higher': 94, 'prices': 166, 'purchase': 171, 'dan': 46, 'greenspan': 87, 'has': 91, 'be': 17, 'reminded': 173, 'mitiga

In [10]:
#SVM classifier is used as classification model
#C is Regularization parameter that is multipled by sqaured L2 penalty and the result added to the cost function. Used default value here.
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X_Tfidf,Train_Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [11]:
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [12]:
confusion_matrix(Test_Y, predictions_SVM)

array([[ 9,  0,  0],
       [ 0, 17,  0],
       [ 0,  0, 35]], dtype=int64)

In [13]:
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  100.0
