In [30]:
# Importing necessary libraries

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.adapt import MLkNN
from sklearn.metrics import hamming_loss, accuracy_score
from sklearn.pipeline import Pipeline
import pickle

In [31]:
# Importing Dataset

df = pd.read_excel(r'D:\Project\legalysis\legalysis\Dataset\ToS\dataset_2.xlsx')
df.head(5)

Unnamed: 0.1,Unnamed: 0,Sentence,Arbitration,Unilateral,Content_removal,Jurisdiction,Choice_of_law,Limitation_of_liability,Unilateral_termination,Contract_by_using
0,,* accepting the terms of service \n,0,0,0,0,0,0,0,0
1,,"the purpose of this website , 9gag . \n",0,0,0,0,0,0,0,0
2,,"com -lrb- the `` site '' -rrb- , owned and ope...",0,0,0,0,0,0,0,0
3,,please read these terms of service -lrb- `` ag...,0,0,0,0,0,0,0,0
4,,"by using or accessing the services , you agree...",0,0,0,0,0,0,0,1


In [32]:
# Dropping the column

df = df.drop('Unnamed: 0', axis=1)
df.head(5)

Unnamed: 0,Sentence,Arbitration,Unilateral,Content_removal,Jurisdiction,Choice_of_law,Limitation_of_liability,Unilateral_termination,Contract_by_using
0,* accepting the terms of service \n,0,0,0,0,0,0,0,0
1,"the purpose of this website , 9gag . \n",0,0,0,0,0,0,0,0
2,"com -lrb- the `` site '' -rrb- , owned and ope...",0,0,0,0,0,0,0,0
3,please read these terms of service -lrb- `` ag...,0,0,0,0,0,0,0,0
4,"by using or accessing the services , you agree...",0,0,0,0,0,0,0,1


In [33]:
# Splitting the Dataset into Sentences and an Array of Values of Unfairness

X = df["Sentence"]
y = np.asarray(df[df.columns[1:]])

In [34]:
# Slitting the Dataset into Train set and Testing set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [35]:
# Converting the text values into feature vectores

vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [36]:
# Checking the values in Dataset

print(X_train_tfidf)

print(y_train)

  (0, 6653)	0.1664721478562663
  (0, 6353)	0.14377885161916878
  (0, 6258)	0.20069139027222932
  (0, 6117)	0.04570878852557127
  (0, 6099)	0.11901140433681845
  (0, 6035)	0.2516919168989188
  (0, 6012)	0.08353782710683363
  (0, 5806)	0.3078187249299754
  (0, 5488)	0.14492824320705144
  (0, 5115)	0.2301399927043174
  (0, 5114)	0.2215025719173067
  (0, 4900)	0.1997674961902873
  (0, 4834)	0.18388938033724123
  (0, 4766)	0.20364697357968253
  (0, 4764)	0.2163429074002199
  (0, 4681)	0.20692559371164562
  (0, 4467)	0.16051046483127498
  (0, 4310)	0.11546133892218427
  (0, 4281)	0.04968111320069707
  (0, 4209)	0.20130514351453613
  (0, 3252)	0.2301399927043174
  (0, 3137)	0.12710592656883524
  (0, 2698)	0.15860352636213623
  (0, 2671)	0.0706276222922924
  (0, 2171)	0.16997963378067432
  :	:
  (6588, 4308)	0.07047664492063421
  (6588, 4281)	0.08426347344042463
  (6588, 4237)	0.17019499508405353
  (6588, 4222)	0.21281746978182547
  (6588, 4209)	0.1707154844279937
  (6588, 4132)	0.144437801973

In [37]:
# Training model for multi-label classification using Logistic Regression

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

model = MultiOutputClassifier(SVC()).fit(X_train_tfidf, y_train)

In [38]:
# Calculating the Performance Matrix of model

prediction = model.predict(X_test_tfidf)

print('Accuracy Score: ', accuracy_score(y_test, prediction))
print('Hamming Loss: ', round(hamming_loss(y_test, prediction), 2))

Accuracy Score:  0.9334513274336284
Hamming Loss:  0.01


In [39]:
# Creating and Training the Pipeline

pipe = Pipeline([('vectorizer', vectorizer), ('multi_output_classifier', model)])
pipe.fit(X_train, y_train)

In [40]:
# Calculating the Performance Matrix of Pipeline

prediction_2 = pipe.predict(X_test)

print('Accuracy Score: ', accuracy_score(y_test, prediction_2))
print('Hamming Loss: ', round(hamming_loss(y_test, prediction_2), 2))

Accuracy Score:  0.9338053097345133
Hamming Loss:  0.01


In [41]:
# Saving the model

new_directory = r'D:\Project\MiniProjectLegalysis\backend\model\models'
os.chdir(new_directory)

with open('unfairness_details_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [42]:
# Saving the Vectorizer

new_directory = r'D:\Project\MiniProjectLegalysis\backend\model\tokenizers'
os.chdir(new_directory)

with open('unfairness_details_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [43]:
# Saving the Pipeline

new_directory = r'D:\Project\MiniProjectLegalysis\backend\model\pipelines'
os.chdir(new_directory)

with open('unfairness_details_pipeline.pkl', 'wb') as f:
    pickle.dump(pipe,f)