In [None]:
# Optional environment setup: uncomment the lines below to install the
# third-party packages imported by this notebook. Inside Jupyter, `%pip` is
# preferable to `!pip` because it targets the running kernel's environment.
#!pip install flair
#!pip install tensorflow
#!pip install keras
#!pip install imbalanced-learn
#!pip install spacy
#!pip install contractions
#!pip install seaborn

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import itertools
import os

# Natural Language Processing (NLP) libraries
import nltk
# Tokenizer models and the stop-word list used by the preprocessing functions.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# on older releases this call only reports that the package is unknown.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stop words as a set for fast membership tests.
STOP_WORDS = set(stopwords.words('english'))

import contractions
import string
# ASCII punctuation characters, as a single string.
PUNC = string.punctuation


# Machine Learning libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras import layers

# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Text embedding libraries
import flair
from flair.data import Sentence
from flair.embeddings import WordEmbeddings

# SpaCy for advanced NLP
import spacy
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    # spacy.load raised OSError: fetch the model package, then load it
    # straight from the freshly installed module.
    print("Model not found. Downloading.")
    import spacy.cli
    spacy.cli.download("en_core_web_md")
    import en_core_web_md
    nlp = en_core_web_md.load()

In [None]:
# Load the dataset. "Unnamed: 0" is presumably a saved index column (TODO confirm);
# it is dropped if present so the cell also works on a file without it.
data = pd.read_csv(os.path.join("..", "data", "final_data.csv"))
data = data.drop(columns="Unnamed: 0", errors="ignore")
# No trailing semicolon: the preview is the cell's output.
data.head()

## 1. Exploratory Data Analysis

In [None]:
# Column names, dtypes and non-null counts of the loaded frame
data.info()

In [None]:
# Count missing values column by column
for column_name, n_missing in data.isnull().sum().items():
    print(column_name, n_missing)

In [None]:
# Distinct values taken by the target column
data['label'].unique()

In [None]:
# Relative frequency of each target label (1 or 0)
plt.figure(figsize=(4, 4))
label_share = data['label'].value_counts(normalize=True).sort_values(ascending=False)
ax = label_share.plot(kind='bar', color='blue')
plt.xlabel('Label', fontsize=12, weight='bold')        # x-axis label
plt.ylabel('Proportion', fontsize=12, weight='bold')   # y-axis label
plt.xticks(rotation=0, ha='center', fontsize=12)
plt.yticks(fontsize=12)
plt.title('Class Distribution', fontsize=14, fontweight="bold");  # figure title

In [None]:
# Merge topic labels that differ from their canonical form only by a trailing space
data['topic_ps'] = data['topic_ps'].replace({
    'Planning ': 'Planning',
    'Sales & Growth ': 'Sales & Growth',
    'Leadership & Strategy ': 'Leadership & Strategy',
})

In [None]:
# Share of problem statements per topic (horizontal bars, smallest share at the bottom)
plt.figure(figsize=(5, 4))
ax = (
    data['topic_ps']
    .value_counts(normalize=True)
    .sort_values(ascending=True)
    .plot(kind='barh', color='mediumblue')
)
plt.ylabel('Topic', weight='bold', fontsize=12)
plt.xlabel('Proportion', weight='bold', fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Trailing semicolon keeps the Text(...) repr out of the cell output.
plt.title('Topics of Problem Statements', fontsize=12, fontweight='bold');

## 2. Text Preprocessing

### Tokenization

In [None]:
def tokenisation(word):
    """Lower-case the input text and split it into tokens with NLTK's word_tokenize."""
    lowered = word.lower()
    return word_tokenize(lowered)

### Expanding Contractions

In [None]:
def exp_contractions(word):
    """Apply contractions.fix to every token and return the results as a list."""
    return [contractions.fix(token) for token in word]

### Removing Punctuation

In [None]:
def rm_puncuations(word):
    """Drop tokens that consist solely of punctuation characters.

    Args:
        word: iterable of string tokens.

    Returns:
        A list with the tokens that contain at least one non-punctuation
        character, in their original order.
    """
    # PUNC is a string, so `token in PUNC` would be a substring test and would
    # miss multi-character tokens such as '``', "''", '...' or '--'.
    # Test character by character instead (empty tokens are dropped as well).
    punct_chars = set(PUNC)
    return [w for w in word if not all(ch in punct_chars for ch in w)]

### Removing Stop Words

In [None]:
def stopwords_rm(word):
    """Return the tokens that are not members of STOP_WORDS, keeping their order."""
    kept = []
    for token in word:
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept


### Lemmatization

In [None]:
def lemmitize(word):
    """Join the tokens with spaces, run the spaCy pipeline, and return each token's lemma."""
    doc = nlp(' '.join(word))
    return [tok.lemma_ for tok in doc]

### Pipeline with all the functions

In [None]:
def pipeline(word):
    """Run the full preprocessing chain on a raw text and return the cleaned text.

    Steps, in order: tokenise, expand contractions, remove punctuation,
    remove stop words, lemmatise; the lemmas are joined with single spaces.
    """
    tokens = tokenisation(word)
    tokens = exp_contractions(tokens)
    tokens = rm_puncuations(tokens)
    tokens = stopwords_rm(tokens)
    lemmas = lemmitize(tokens)
    return ' '.join(lemmas)

In [None]:
# Text columns to preprocess: problem statement and resource content
cols = ['content_ps', 'content_re']
for text_col in cols:
    # Overwrites the raw text with its preprocessed form
    data[text_col] = data[text_col].map(pipeline)

In [None]:
# Preview the preprocessed text (no trailing semicolon, so the table is displayed)
data[['content_ps', 'content_re']].head(5)

## 3. Text Representation and Feature Engineering

In [None]:
# Initialise flair's word-embedding wrapper with the 'glove' vectors
glove_embedding = WordEmbeddings('glove')

In [None]:
# Embed the words of a sentence and average them into one vector
def GloVe_embedding(s):
    """Return the mean GloVe vector of the tokens in `s` as a 1-D numpy array.

    Args:
        s: text to embed (a string).

    Returns:
        1-D array of length `glove_embedding.embedding_length`. A text that
        yields no tokens returns a zero vector rather than raising
        ZeroDivisionError.
    """
    sentence = Sentence(s)
    if len(sentence) == 0:
        # Preprocessing can leave nothing behind (e.g. a text made only of stop words).
        return np.zeros(glove_embedding.embedding_length, dtype=np.float32)
    glove_embedding.embed(sentence)
    # Move tensors to host memory before converting so this also works when
    # flair keeps embeddings on a GPU.
    token_vectors = np.stack(
        [token.embedding.detach().cpu().numpy() for token in sentence]
    )
    return token_vectors.mean(axis=0)

In [None]:
# Replace each text column with its sentence vector
for col in cols:
    data[col] = data[col].apply(GloVe_embedding)

# Build the feature matrix once, after BOTH columns have been embedded.
# (Inside the loop, the first pass would build a frame from text that is
# not yet embedded, only to overwrite it on the next pass.)
ps = pd.DataFrame(data['content_ps'].to_list())
re = pd.DataFrame(data['content_re'].to_list())
# Features = problem-statement vector followed by resource vector
X = pd.concat([ps, re], axis=1)
y = data['label']

In [None]:
# Look at the first rows now that both text columns hold embedding vectors
data.loc[:, ['content_ps', 'content_re']].head(5)

## 4. Machine Learning Models Training

### Train-test split

In [None]:
# Keep 80% of the rows for training; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)


In [None]:
# Training-set dimensions prior to oversampling
print(f"Shape of X before SMOTE: {X_train.shape}")

In [None]:
# SMOTE to fix class imbalance:
# it generates new observations by interpolating between existing observations.
# Only the training split is resampled; X_test / y_test are left untouched.
sm = SMOTE(random_state=42)
x_train = np.array(X_train)
# np.asarray(...).ravel() accepts both a Series and an array and avoids
# Series.ravel(), which is deprecated in recent pandas releases.
x_train, y_train = sm.fit_resample(x_train, np.asarray(y_train).ravel())
print(f'''Shape of X before SMOTE: {X_train.shape}''')
print(f'''Shape of X after SMOTE: {x_train.shape}''')

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix', cmap=plt.cm.YlOrRd):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: square array of counts, rows = true labels, columns = predictions.
        classes: tick labels, one per class, in the same order as `cm`.
        normalize: if True, divide each row by its total (and print the
            normalised matrix) so that cells show per-class rates.
        title: figure title.
        cmap: matplotlib colormap used for the heat map.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any true sample stay at 0 instead of becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print(cm)
    # Honour the caller-supplied colormap (it used to be ignored).
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum are dark, so annotate them in white.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center", fontweight="semibold",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### Model I: Logistic Regression

In [None]:
model_1 = LogisticRegression(max_iter=1000, random_state=42)
model_1.fit(x_train, y_train)        # fit on the SMOTE-resampled training set
y_pred_1 = model_1.predict(X_test)   # predict on the held-out test set
# scikit-learn metrics expect (y_true, y_pred); with the arguments swapped the
# report shows precision and recall exchanged.
score_1 = round(accuracy_score(y_test, y_pred_1), 2)
print("The Accuracy of Logistic Regression is :", score_1)
print(classification_report(y_test, y_pred_1))

print('\nConfusion Matrix on Test Data:\n------')
sns.set_style('white')
class_names = ['0', '1']
plot_confusion_matrix(confusion_matrix(y_test, y_pred_1),
                      classes=class_names, normalize=True,
                      title='Logistic Regression')
plt.show()

### Model II: Support Vector Classifier

In [None]:
model_2 = SVC(random_state=42)          # initiate the model
model_2.fit(x_train, y_train)           # fit on the SMOTE-resampled training set
y_predict_2 = model_2.predict(X_test)   # predict on the held-out test set

# scikit-learn metrics expect (y_true, y_pred); with the arguments swapped the
# report shows precision and recall exchanged.
score_2 = round(accuracy_score(y_test, y_predict_2), 2)
print("The Accuracy of Support Vector Classifier is :", score_2)
print(classification_report(y_test, y_predict_2))

print('\nConfusion Matrix on Test Data:\n------')
sns.set_style('white')
class_names = ['0', '1']
plot_confusion_matrix(confusion_matrix(y_test, y_predict_2),
                      classes=class_names, normalize=True,
                      title='Support Vector Classifier')
plt.show()

### Model III : Random Forest Classifier

In [None]:
model_3 = RandomForestClassifier(random_state=42)  # initiate the model
model_3.fit(x_train, y_train)                      # fit on the SMOTE-resampled training set
y_predict_3 = model_3.predict(X_test)              # predict on the held-out test set
# scikit-learn metrics expect (y_true, y_pred); with the arguments swapped the
# report shows precision and recall exchanged.
score_3 = round(accuracy_score(y_test, y_predict_3), 2)
print("The Accuracy of the Random Forest is :", score_3)
print(classification_report(y_test, y_predict_3))

print('\nConfusion Matrix on Test Data:\n------')
sns.set_style('dark')
class_names = ['0', '1']
plot_confusion_matrix(confusion_matrix(y_test, y_predict_3),
                      classes=class_names, normalize=True,
                      title='Random Forest')
plt.show()

### Model IV: Artificial Neural Network

In [None]:
# Size the input layer from the training matrix instead of hard-coding 200,
# so the network follows any change in the embedding dimension.
n_features = x_train.shape[1]
model = Sequential([
    keras.Input(shape=(n_features,)),
    Dense(16, activation='relu'),
    layers.Dropout(0.2),
    Dense(8, activation='relu'),   # additional hidden layer
    layers.Dropout(0.2),
    Dense(1, activation='sigmoid'),  # probability of the positive class
])
# `metrics` is passed as a list: newer Keras releases reject a bare string.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train on the resampled training set, then report accuracy on both splits
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=100,
    verbose=True,
)

loss, accuracy = model.evaluate(x_train, y_train, verbose=True)
print(f"Training Accuracy: {accuracy:.4f}")

# `loss` and `accuracy` now hold the test-set values
loss, accuracy = model.evaluate(X_test, y_test, verbose=True)
print(f"Testing Accuracy:  {accuracy:.4f}")

### Summary Results

In [None]:
# Build a dataframe with the test-set accuracy of each baseline model.
# The values come from the cells above rather than being typed in by hand, so
# the table cannot drift from the actual run:
#   score_3 -> Random Forest, accuracy -> ANN (last model.evaluate on the test set),
#   score_2 -> SVC, score_1 -> Logistic Regression.
Models = ['RF', 'ANN', 'SVM', 'Logit']  # model names
Accuracy = [score_3, round(float(accuracy), 2), score_2, score_1]  # matching accuracies
models_df = pd.DataFrame({'Accuracy': Accuracy}, index=Models)
models_df = models_df.sort_values(ascending=True, by='Accuracy')
# No trailing semicolon, so the table is displayed
models_df

In [None]:
# Visualise the scores in a bar plot.
# DataFrame.plot opens its own figure unless it is handed an Axes, so the axes
# are created explicitly; a separate plt.figure() call would leave an empty figure.
fig, ax = plt.subplots(figsize=(10, 8))
models_df.plot.bar(ax=ax, color='royalblue')
ax.set_title('Performance Measures of Trained ML Models', fontweight="bold", size=12)
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
plt.xticks(rotation=35, ha='center', fontsize=11)  # rotate the x-axis labels
plt.yticks(fontsize=11);

Based on the results above, the Random Forest classifier is shortlisted for fine-tuning.

## 5. Model Fine Tuning

### Randomized Search

n_estimators = number of trees in the forest

max_features = max number of features considered for splitting a node

max_depth = max number of levels in each decision tree

min_samples_split = min number of data points placed in a node before the node is split

min_samples_leaf = min number of data points allowed in a leaf node

bootstrap = method for sampling data points (with or without replacement)

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' is no longer accepted by current scikit-learn and, for classifiers,
# meant the same as 'sqrt', so the two genuinely different options are searched.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# The base model is seeded so that repeated runs score candidates identically.
rf = RandomForestClassifier(random_state=42)
# Random search with 3-fold cross-validation over n_iter = 8 sampled
# combinations, using all available cores.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=8, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model on the resampled training set
random_search_RF = rf_random.fit(x_train, y_train)

In [None]:
# Report the best cross-validated score together with the winning hyperparameters
best_cv_score = round(random_search_RF.best_score_, 2)
winning_params = random_search_RF.best_params_
print("Best: %f using %s" % (best_cv_score, winning_params))

### Final Model

In [None]:
# Get the best parameters found by RandomizedSearchCV
best_params = random_search_RF.best_params_
# Create the final model from those parameters; the seed makes the fit repeatable
final_model = RandomForestClassifier(random_state=42, **best_params)
final_model.fit(x_train, y_train)              # fit on the resampled training set
y_predict_final = final_model.predict(X_test)  # predict on the held-out test set
# scikit-learn metrics expect (y_true, y_pred); with the arguments swapped the
# report shows precision and recall exchanged.
# Stored under its own name so the baseline Random Forest score_3 is not overwritten.
score_final = round(accuracy_score(y_test, y_predict_final), 2)
print("The Accuracy of the Final Model on the test-set is :", score_final)
print(classification_report(y_test, y_predict_final))

print('\nConfusion Matrix on Test Data:\n------')
sns.set_style('ticks')
class_names = ['0', '1']
plot_confusion_matrix(confusion_matrix(y_test, y_predict_final),
                      classes=class_names, normalize=True,
                      title='Confusion Matrix of Final Model')
plt.show()

## 6. Test & Evaluation

In [None]:
content_ps = 'Smart textiles business KYMIRA makes sportswear with high-quality material, using sustainable manufacturing methods. The team aimed to keep operations in the UK and source materials locally, to fit their sustainable values. However, this turned out to be a high cost approach. It was also a struggle to find the advanced manufacturing techniques they were looking for.'
# Mis-encoded characters (mojibake) in the original text are restored to the
# intended apostrophes and dashes so they do not become garbage tokens.
content_rs = 'If you think you’ve found a potential problem in your business, the next step is to decide on the actions you’re going to take in response. What you decide to do – and how quickly you do it – will be pivotal in determining whether the problem can be solved at a small scale or whether it develops into a new, bigger challenge.'

# Build each vector exactly as the training features were built: the same
# preprocessing pipeline, then the mean GloVe vector over the sentence tokens.
# The resource vector is computed from the resource text (it used to be
# computed from the problem-statement tokens by mistake).
ps_t_processed = GloVe_embedding(pipeline(content_ps)).reshape(1, -1)
ps_r_processed = GloVe_embedding(pipeline(content_rs)).reshape(1, -1)

# Same column order as X: problem-statement vector, then resource vector
test = np.hstack((ps_t_processed, ps_r_processed))

prediction = final_model.predict(test)
print('Label between Statement and Resource is :', 'positive' if prediction[0]==1 else 'negative')