In [None]:
# Import necessary libraries  

import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [None]:
# Importing Dataset

# Read the Excel workbook into a DataFrame.
df = pd.read_excel(r'D:\Project\Legal Documents\Dataset\ToS\dataset.xlsx')

# head must be *called*: printing the bare attribute shows the bound-method
# repr of the whole frame rather than a preview of the first five rows.
print(df.head())

<bound method NDFrame.head of                                                Sentence  Unnamed: 1  Fairness
0                   * accepting the terms of service \n         NaN        -1
1               the purpose of this website , 9gag . \n         NaN        -1
2     com -lrb- the `` site '' -rrb- , owned and ope...         NaN        -1
3     please read these terms of service -lrb- `` ag...         NaN        -1
4     by using or accessing the services , you agree...         NaN         1
...                                                 ...         ...       ...
9409  you agree that given the unique and irreplacea...         NaN        -1
9410  therefore , for disputes that are not required...         NaN        -1
9411  you agree to limit your claims to claims for m...         NaN        -1
9412  and , you agree not to seek injunctive or equi...         NaN        -1
9413  we are not liable for any changes or problems ...         NaN        -1

[9414 rows x 3 columns]>


In [None]:
# Removing the 'Unnamed: 1' column so only Sentence and Fairness remain

# The frame is modified in place, so running this cell a second time raises
# a KeyError because the column is already gone.
df.drop('Unnamed: 1', axis='columns', inplace=True)
print(df.head())

                                            Sentence  Fairness
0                * accepting the terms of service \n        -1
1            the purpose of this website , 9gag . \n        -1
2  com -lrb- the `` site '' -rrb- , owned and ope...        -1
3  please read these terms of service -lrb- `` ag...        -1
4  by using or accessing the services , you agree...         1


In [None]:
# Converting the Fairness values into '0' and '1'

# Vectorised relabelling: -1 becomes 0 and every other value becomes 1.
# The earlier row loop iterated over range(df.shape[0]-1) and so never
# touched the last row, which kept its original label and leaked a stray
# extra class into the targets.
# NOTE: this cell is not idempotent - run it once per load, because a second
# run would turn the 0 labels into 1.
df['Fairness'] = (df['Fairness'] != -1).astype('int64')

print(df.head(10))

                                            Sentence  Fairness
0                * accepting the terms of service \n         0
1            the purpose of this website , 9gag . \n         0
2  com -lrb- the `` site '' -rrb- , owned and ope...         0
3  please read these terms of service -lrb- `` ag...         0
4  by using or accessing the services , you agree...         1
5  if you do not agree to all the terms and condi...         0
6  the services are accessed by you -lrb- `` subs...         0
7  subject to the terms and conditions of this ag...         0
8  services shall include , but not be limited to...         0
9  9gag , inc may change , suspend or discontinue...         1


In [None]:
# Separating the input text (X) from the label column (Y)

X, Y = df['Sentence'], df['Fairness']

In [None]:
# Holding out 20% of the rows for testing; the fixed seed keeps the split repeatable

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=3,
    test_size=0.2,
)

In [None]:
# Checking the split sizes against the full frame, first for X and then for Y

for train_part, test_part in ((X_train, X_test), (Y_train, Y_test)):
    print(df.shape)
    print(train_part.shape)
    print(test_part.shape)

(9414, 2)
(7531,)
(1883,)
(9414, 2)
(7531,)
(1883,)


In [None]:
# Turning the raw sentences into TF-IDF feature matrices

tfidf = TfidfVectorizer()

# The tuple is evaluated left to right, so the vectorizer is fitted on the
# training sentences before the test sentences are transformed with it.
X_train_features, X_test_features = (
    tfidf.fit_transform(X_train),
    tfidf.transform(X_test),
)

In [None]:
# Fitting a Support Vector Classifier on the TF-IDF training features

from sklearn.svm import SVC

model = SVC()  # constructed with the library defaults
model.fit(X=X_train_features, y=Y_train)

In [None]:
# Evaluating the SVM on the test split

prediction = model.predict(X_test_features)

# Scalar scores, printed in the same order and format as a print-per-metric cell.
for label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(label, scorer(Y_test, prediction))

print("\n")

# Confusion matrix first, then the per-class report.
for summary in (confusion_matrix, classification_report):
    print(summary(Y_test, prediction))

Accuracy: 0.9553903345724907
Precision: 0.9435483870967742
Recall: 0.6030927835051546
F1: 0.7358490566037736


[[1682    7]
 [  77  117]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1689
           1       0.94      0.60      0.74       194

    accuracy                           0.96      1883
   macro avg       0.95      0.80      0.86      1883
weighted avg       0.95      0.96      0.95      1883



In [None]:
# Saving the model

import pickle

# Raw string: the backslashes are Windows path separators. Without the r
# prefix, \P, \L and \m are invalid escape sequences, which newer Python
# versions warn about and plan to reject. The resulting path is unchanged.
new_directory = r'D:\Project\Legal Documents\Programs\models'

# Changes the working directory for the rest of the kernel session, so the
# relative file name below (and any later relative path) resolves here.
os.chdir(new_directory)

with open('fairness_check_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [None]:
# Persisting the fitted TF-IDF vectorizer

new_directory = r'D:\Project\Legal Documents\Programs\tokenizers'
os.chdir(new_directory)  # later relative paths resolve against this folder

# Serialise to bytes first, then write them out in one call.
with open('fairness_check_vectorizer.pkl', mode='wb') as file:
    file.write(pickle.dumps(tfidf))

In [None]:
# Creating and training the pipeline (vectorizer followed by the SVM)

# NOTE(review): the steps are the same tfidf and model objects used in the
# earlier cells, not fresh copies, so fitting the pipeline refits them.
pipe = Pipeline(steps=[('vectorizer', tfidf), ('svm', model)])
pipe.fit(X_train, Y_train)

In [None]:
# Calculating Performance Metrics of the pipeline

# The pipeline takes raw sentences; vectorisation happens inside it.
prediction_2 = pipe.predict(X_test)

# Every metric below is computed from prediction_2. Previously F1, the
# confusion matrix and the classification report were computed from
# `prediction` (the standalone SVM's output), so they did not describe
# the pipeline at all.
print("Accuracy:", accuracy_score(Y_test, prediction_2))
print("Precision:", precision_score(Y_test, prediction_2))
print("Recall:", recall_score(Y_test, prediction_2))
print("F1:", f1_score(Y_test, prediction_2))

print("\n")

print(confusion_matrix(Y_test, prediction_2))

print(classification_report(Y_test, prediction_2))

Accuracy: 0.9553903345724907
Precision: 0.9435483870967742
Recall: 0.6030927835051546
F1: 0.7358490566037736


[[1682    7]
 [  77  117]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1689
           1       0.94      0.60      0.74       194

    accuracy                           0.96      1883
   macro avg       0.95      0.80      0.86      1883
weighted avg       0.95      0.96      0.95      1883



In [None]:
# Saving the pipeline

# Raw string: the backslashes are Windows path separators. Without the r
# prefix, \P, \L and \p are invalid escape sequences, which newer Python
# versions warn about and plan to reject. The resulting path is unchanged.
new_directory = r'D:\Project\Legal Documents\Programs\pipelines'

# Changes the working directory for the rest of the kernel session, so the
# relative file name below resolves inside this folder.
os.chdir(new_directory)

with open('fairness_check_pipeline.pkl', 'wb') as f:
    pickle.dump(pipe, f)