In [1]:
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from statistics import mean 
import numpy as np
import keras 
from keras.models import Sequential, Model 
from keras import layers
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding
from keras.layers.merge import Concatenate

Using TensorFlow backend.


In [2]:
# Preprocessing
def preprocess(row):
    text = row['content']
    text= text.lower()
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = regex.sub(' ', text)
    meaningless_words = ['words', 'business wire', 'bwr english', 'copyright', 'businesswire.com',
                         'dow jones newswires','djdn english','dow jones institutional news', 
                         'all rights reserved', 'dow jones company inc', 'pr Newswire',
                         'prn english', 'the wall street journal',  'dow jones & company inc',
                         'j b4 english']
    for words in meaningless_words:
        text = text.replace(words, '')
    
    # picking paragraph containing keywords 
    text = text.split("/n/n")
    text = [para for para in text if "license" or 'licensing' in para]
    text = ''.join(text)
    
        
    # remove remaining tokens that are not alphabetic
    text = text.split(' ')
    text = [word for word in text if word.isalpha()]
    
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    stemmer = SnowballStemmer("english")
    lemmatizer = WordNetLemmatizer()
    stemmed_text = []
    for word in text:
        #stemmed_text.append(stemmer.stem(word))
        stemmed_text.append(lemmatizer.lemmatize(word))
        
    text = " ".join(stemmed_text)
    row['content'] = text
    return(row)
    
    
    
path = r'MASTER_TRAINING.xlsx'
licensing_df = pd.read_excel(path)

columns = ['content', 'licensing_agreement']
data = licensing_df[columns]
data = data.apply(preprocess, axis = 1)

X = data['content']
y = data['licensing_agreement']

# Converting to binary
#X = X.apply(lambda x:' '.join(f"{ord(i):08b}" for i in x))

vectorizer = CountVectorizer(ngram_range=(1,2))
X = vectorizer.fit_transform(X).toarray()
#print(vectorizer.get_feature_names())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest
model = RandomForestClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy_score(y_test,predictions)

from sklearn.model_selection import KFold 
kf = KFold(n_splits=10)
kf.get_n_splits(X)
print(kf) 
KFold(n_splits=10, random_state=None, shuffle=False)

accuracy=[]
precision = []
recall = []
for train_index, test_index in kf.split(X):
    #print("Train:", train_index, "Test:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    clf_report = classification_report(y_test, predictions, output_dict = True)
    accuracy.append(clf_report['accuracy'])
    precision.append(clf_report['weighted avg']['precision'])
    recall.append(clf_report['weighted avg']['recall'])
print("Accuracy: {}, Precision: {}, Recall: {}".format(mean(accuracy),mean(precision),mean(recall)))



KFold(n_splits=10, random_state=None, shuffle=False)
Accuracy: 0.7350840336134454, Precision: 0.7447690509521512, Recall: 0.7350840336134454


In [5]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression 
model_logistic = LogisticRegression()
model_logistic.fit(X_train, y_train)
predictions = model_logistic.predict(X_test)
clf_report = classification_report(y_test, predictions, output_dict = True)
print('Accuracy: ',clf_report['accuracy'])

Accuracy:  0.9064039408866995


In [None]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dc = DecisionTreeClassifier()
dc_fit = dc.fit(X,y)
dc_fit.predict(X_test)
clf_report = classification_report(y_test, predictions, output_dict = True)
accuracy.append(accuracy_score(y_test,predictions))
print(mean(accuracy))

# Cross Validation
cv_adaboost = KFold(n_splits=10)
cv_adaboost.get_n_splits(X)
print(cv_adaboost) 
KFold(n_splits=10, random_state=None, shuffle=False)

accuracy=[]
precision = []
recall = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    clf_report = classification_report(y_test, predictions, output_dict = True)
    accuracy.append(clf_report['accuracy'])
    precision.append(clf_report['weighted avg']['precision'])
    recall.append(clf_report['weighted avg']['recall'])
print("Accuracy: {}, Precision: {}, Recall: {}".format(mean(accuracy),mean(precision),mean(recall)))

0.7672872597314788
KFold(n_splits=10, random_state=None, shuffle=False)


In [11]:
# Support Vector Classifier
from sklearn.svm import SVC
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
clf_predictions = clf.predict(X_test)
# print("Accuracy: {}%".format(clf.score(X_test, y_test) * 100 ))

from sklearn.model_selection import KFold 
kf = KFold(n_splits=10)
kf.get_n_splits(X)
print(kf) 
KFold(n_splits=10, random_state=None, shuffle=False)

accuracy=[]
precision = []
recall = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    clf_report = classification_report(y_test, predictions, output_dict = True)
    accuracy.append(clf_report['accuracy'])
    precision.append(clf_report['weighted avg']['precision'])
    recall.append(clf_report['weighted avg']['recall'])
print("Accuracy: {}, Precision: {}, Recall: {}".format(mean(accuracy),mean(precision),mean(recall)))

KFold(n_splits=10, random_state=None, shuffle=False)
Accuracy: 0.8096517917511832, Precision: 0.8127980606879109, Recall: 0.8096517917511832


In [14]:
# AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
clf_adaboost=ab.fit(X,y)
clf.predict(X_test)
clf_report = classification_report(y_test, predictions, output_dict = True)
accuracy.append(accuracy_score(y_test,predictions))
print(mean(accuracy))

# Cross Validation
cv_adaboost = KFold(n_splits=10)
cv_adaboost.get_n_splits(X)
print(cv_adaboost) 
KFold(n_splits=10, random_state=None, shuffle=False)

accuracy=[]
precision = []
recall = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    clf_report = classification_report(y_test, predictions, output_dict = True)
    accuracy.append(clf_report['accuracy'])
    precision.append(clf_report['weighted avg']['precision'])
    recall.append(clf_report['weighted avg']['recall'])
print("Accuracy: {}, Precision: {}, Recall: {}".format(mean(accuracy),mean(precision),mean(recall)))

0.8188952697066287
KFold(n_splits=10, random_state=None, shuffle=False)
Accuracy: 0.8096517917511832, Precision: 0.8127980606879109, Recall: 0.8096517917511832
