In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

import scipy.sparse
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression as lm
from scipy import stats

import gensim
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize, punkt
from gensim.models import KeyedVectors

import matplotlib.collections as plt1
import matplotlib.pyplot as plt2
from matplotlib.legend_handler import HandlerLineCollection, HandlerTuple
from matplotlib.lines import Line2D
import seaborn as sns

import pandas as pd
pd.set_option('display.max_columns', 50)

import pickle
import time
import re
import csv
import math
import pickle
import time
import spacy

nltk.download('punkt')

In [None]:
# Locations of the two inputs (absolute paths on the original analysis host).
labelled_csv_path = "/home/kno5cac/git/publicrd/data/prd/Digital_abstract_labelled/labelled_abstracts.csv"
corpus_pickle_path = "/home/kno5cac/git/publicrd/data/prd/Paper/FR_meta_and_final_tokens_23DEC21.pkl"

# Data frame with only the labelled abstracts
df_lab = pd.read_csv(labelled_csv_path)

# Data frame with all of the abstracts
# NOTE(review): unpickling can execute code stored in the file — only load pickles from a trusted source.
df = pd.read_pickle(corpus_pickle_path)

In [None]:
# Keep only the rows that have an abstract, renumber them from 0, and store that
# row number in an explicit 'index' column (used later to link back to PROJECT_ID).
has_abstract = df.ABSTRACT.notnull()
df = df[has_abstract].reset_index(drop=True)
df['index'] = df.index
df.head()

In [None]:
# Restrict the labelled frame to the identifier, the text and the two labelling columns.
labelled_columns = ['PROJECT_ID', 'ABSTRACT', 'Is it related to Big-Data', 'label']
df_lab = df_lab.loc[:, labelled_columns]
df_lab

In [None]:
# Left-join the labelling columns onto the corpus by PROJECT_ID.
# PROJECT_ID is converted to a numeric dtype first (presumably to match df_lab — confirm).
df['PROJECT_ID'] = pd.to_numeric(df['PROJECT_ID'])
label_columns = df_lab[['PROJECT_ID', 'Is it related to Big-Data', 'label']]
df_merge = pd.merge(df, label_columns, how='left', on='PROJECT_ID')
# NOTE(review): a left join adds rows when a PROJECT_ID occurs more than once in df_lab;
# the length shown below should equal len(df).
len(df_merge)

In [None]:
# Display the merged frame for a visual check
df_merge

In [None]:
# Project IDs of every labelled abstract
project_id_lab = list(df_lab['PROJECT_ID'])

# Rows of the merged frame whose PROJECT_ID is labelled, and their 'index' values
is_labelled = df_merge['PROJECT_ID'].isin(project_id_lab)
subset_df = df_merge.loc[is_labelled, ['index', 'PROJECT_ID']]
index_lab = list(subset_df['index'])

# 1. Doc2Vec approach

## 1.1. Vectorize, build the training and test sample

In [None]:
#data = df['ABSTRACT']

In [None]:
#indicies = df.index

In [None]:
#tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
#tagged_df = pd.DataFrame(tagged_data)

In [None]:
# k: how many abstracts (taken from the top of the frame) enter the analysis
k = 200_000

In [None]:
# Restrict the corpus to its first k rows and tokenise those abstracts for Doc2Vec.
# .iloc makes the positional slice explicit (the original indexed the Series with the
# one-element tuple `0:k,`), and .copy() gives df its own data so that the Prediction_*
# columns added later are not written into a slice of the full frame.
df = df.iloc[:k].copy()
data = df['ABSTRACT']
indicies = data.index

# One TaggedDocument per abstract: lower-cased word tokens, tagged with the
# abstract's position in `data` (as a string).
tagged_data = [
    TaggedDocument(words=word_tokenize(abstract.lower()), tags=[str(position)])
    for position, abstract in enumerate(data)
]

# Frame view of the tagged documents; its 'tags' column is read when the vectors are collected.
tagged_df = pd.DataFrame(tagged_data)

In [None]:
# Keep only the labelled abstracts that fall inside the first k rows of the corpus.
# Filtering by value does not rely on index_lab being sorted (the original took a prefix
# of length m), and the project IDs are re-derived from subset_df so that they stay
# aligned with index_lab (the original sliced a list that was in df_lab order).
in_subset = subset_df['index'] < k
index_lab = list(subset_df.loc[in_subset, 'index'])
project_id_lab = list(subset_df.loc[in_subset, 'PROJECT_ID'])

# Number of labelled abstracts retained (kept because the original cell defined it)
m = len(index_lab)

In [None]:
# Build a Doc2Vec model (100-dimensional vectors, 100 epochs), train it on every
# tagged abstract in tagged_data, and save it to disk.
# (min_count=2 - the minimum number of times a word shows up - is not set here.)
model = Doc2Vec(vector_size=100, epochs=100)
model.build_vocab(tagged_data)
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
model.save("d2v.model")

In [None]:
# Re-read the model that was saved to "d2v.model"
model = gensim.models.doc2vec.Doc2Vec.load("d2v.model")

In [None]:
# For every row of tagged_df, look up the document vector(s) stored under that row's tags
vector = [
    model.dv[tagged_df['tags'][row]].tolist()
    for row in range(len(tagged_df))
]

In [None]:
# Flatten one level: every entry of `vector` is itself a list, and its items go into `mat` in order
mat = [item for group in vector for item in group]

In [None]:
# Tabular view of the document vectors (one row per entry of mat)
training = pd.DataFrame(data=mat)
training

In [None]:
# NumPy array holding the same values as the `training` frame
trainingnp = np.asarray(training)

In [None]:
# Split the labelled abstracts into a training set (80%) and a test set (the rest).
import random

# Fixed seed: the split, and every score computed from it, is identical on each run.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# Row positions of the training and test abstracts, both in ascending order so that
# the Y values extracted later line up with the X rows.
index_training = sorted(split_rng.sample(index_lab, int(0.8 * len(index_lab))))
index_test = sorted(set(index_lab) - set(index_training))

# Document vectors of the two sets, taken from the Doc2Vec matrix by row position
X_train_doc2vec = [mat[i] for i in index_training]
X_test_doc2vec = [mat[i] for i in index_test]

In [None]:
# Labels for the training and test sets, collected in the same order as the indices
# produced by the sampling step. Each lookup yields the 'label' values of the rows whose
# 'index' equals i; the per-index lists are concatenated into one flat list.
Y_train_doc2vec = [
    label
    for i in index_training
    for label in df_merge.loc[df_merge['index'] == i, 'label'].values.tolist()
]

Y_test_doc2vec = [
    label
    for i in index_test
    for label in df_merge.loc[df_merge['index'] == i, 'label'].values.tolist()
]

## 1.2. KNN classifier

In [None]:
# Neighbour count for KNN: the integer part of the square root of the training-set size.
# NOTE(review): this rebinds `k`, which earlier held the number of abstracts kept for the analysis.
n_train = len(Y_train_doc2vec)
k = int(math.sqrt(n_train))
print('Number of neighbors:', k)

In [None]:
# K-nearest-neighbours classifier with k neighbours, fitted on the training vectors
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=k)
classifier.fit(X=X_train_doc2vec, y=Y_train_doc2vec)

In [None]:
# KNN predictions for the test vectors
Y_pred_doc2vec = classifier.predict(X=X_test_doc2vec)

In [None]:
# Classifier performance on the held-out labelled abstracts
from sklearn.metrics import classification_report, confusion_matrix

# Pin the class order so that row/column 0 of the matrix is the negative class.
# Without `labels`, confusion_matrix sorts the classes, which puts 'Big-data' before
# 'Non Big-data' and makes position [0, 0] the true positives - the opposite of the
# 'True Neg' / 'False' annotations drawn on the heat-map in the following cell.
# NOTE(review): assumes the label column holds exactly these two strings (they are the
# values that later cells compare against) - confirm.
class_order = ['Non Big-data', 'Big-data']
confusion = confusion_matrix(Y_test_doc2vec, Y_pred_doc2vec, labels=class_order)
print(classification_report(Y_test_doc2vec, Y_pred_doc2vec))

In [None]:
# Heat-map of the KNN confusion matrix, annotated with the name of each quadrant.
# NOTE(review): the quadrant names and the False/True tick labels assume that row/column 0
# of `confusion` is the negative class - confirm against the label order used to build it.
labels = np.asarray(['True Neg', 'False Pos', 'False Neg', 'True Pos']).reshape(2, 2)
ax = sns.heatmap(confusion, annot=labels, fmt='', cmap='Blues')
ax.set_title('KNN Confusion Matrix\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
# Tick labels, in the same order as the rows/columns of the matrix
ax.xaxis.set_ticklabels(['False', 'True'])
ax.yaxis.set_ticklabels(['False', 'True'])
# Write the file BEFORE showing: once plt2.show() has displayed the figure, a later
# plt2.savefig() would act on a new, empty figure.
ax.figure.savefig('doc2vec_KNN.PNG')
plt2.show()

In [None]:
# Classify every abstract of the subset from its Doc2Vec vector
Y_doc2vec = classifier.predict(X=mat)

# mat was built in the row order of df, so the predictions are attached as a new column
df['Prediction_KNN'] = Y_doc2vec
df.head()

In [None]:
# Abstracts that the KNN classifier labelled 'Big-data'.
# .copy() makes df_bigdata an independent frame, so the FY conversion applied to it
# later does not assign into a slice of df (pandas SettingWithCopyWarning).
df_bigdata = df[df['Prediction_KNN'] == 'Big-data'].copy()
print('Number of abstracts related to Big data:', len(df_bigdata))

In [None]:
# Named hex colours used by the charts
cb_pal = dict(
    blue='#377eb8',
    orange='#ff7f00',
    green='#4daf4a',
    pink='#f781bf',
    brown='#a65628',
    purple='#984ea3',
    gray='#999999',
    red='#e41a1c',
    yellow='#dede00',
)

In [None]:
# Number of KNN-predicted Big-data projects per fiscal year, drawn as a bar chart.
from matplotlib.ticker import FuncFormatter

# .assign builds a new frame instead of writing into df_bigdata, which may be a slice of df.
df_bigdata = df_bigdata.assign(FY=df_bigdata["FY"].astype('int'))
year_counts = df_bigdata['FY'].value_counts().sort_index(ascending=True)

# Distribution
year = year_counts.index.tolist()
count = year_counts.values

fig = plt2.figure()
ax = fig.add_subplot(111)
ax.set_axisbelow(True)
plt2.grid(True, color = "whitesmoke")
plt2.bar(year, count, color='navy')
plt2.xlim(2007.3,2020.7)
plt2.xticks(year, rotation=45)
# Thousands separators come from a formatter, so the labels follow the ticks wherever
# matplotlib places them (set_yticklabels pins label strings without pinning tick positions).
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:,}'.format(int(value))))
plt2.xlabel("FY")
plt2.ylabel("Number of Projects")
plt2.title("Big data sample")

# NOTE(review): the descriptive-statistics section at the end of the notebook writes the same file name.
plt2.savefig("big_data_time.png", dpi = 800, bbox_inches = "tight")

In [None]:
# Share (in percent of df_bigdata rows) of each funding department, drawn as a bar chart
agency_counts = 100*df_bigdata["DEPARTMENT"].value_counts()/len(df_bigdata)
agency = agency_counts.index.tolist()
count = agency_counts.values

fig, ax = plt2.subplots()
ax.set_axisbelow(True)
ax.grid(True, color="whitesmoke")
ax.bar(agency, count, color=cb_pal['blue'])
ax.set_ylim(0, 100)
ax.set_xlabel("Agency")
ax.set_ylabel("Percent of Dataset")
ax.set_title("Project Distribution by Funding Agency")

# NOTE(review): the descriptive-statistics section at the end of the notebook writes the same file name.
fig.savefig("big_data_agency.png", dpi=800, bbox_inches="tight")

In [None]:
# save the data
#df_bigdata.to_csv("/project/biocomplexity/sdad/projects_data/ncses/prd/Digital_abstract_labelled/doc2vec_method.csv")

## 1.3. SVM

In [None]:
# Support-vector classifier with scikit-learn's default parameters, fitted on the training vectors
from sklearn import svm
classifier_svm = svm.SVC()
classifier_svm.fit(X=X_train_doc2vec, y=Y_train_doc2vec)

In [None]:
# SVM predictions for the test vectors
Y_pred_doc2vec = classifier_svm.predict(X=X_test_doc2vec)

In [None]:
# Classifier performance on the held-out labelled abstracts
from sklearn.metrics import classification_report, confusion_matrix

# Pin the class order so that row/column 0 of the matrix is the negative class.
# Without `labels`, confusion_matrix sorts the classes, which puts 'Big-data' before
# 'Non Big-data' and makes position [0, 0] the true positives - the opposite of the
# 'True Neg' / 'False' annotations drawn on the heat-map in the following cell.
# NOTE(review): assumes the label column holds exactly these two strings (they are the
# values that other cells compare against) - confirm.
class_order = ['Non Big-data', 'Big-data']
confusion = confusion_matrix(Y_test_doc2vec, Y_pred_doc2vec, labels=class_order)
print(classification_report(Y_test_doc2vec, Y_pred_doc2vec))

In [None]:
# Heat-map of the SVM confusion matrix, annotated with the name of each quadrant.
# NOTE(review): the quadrant names and the False/True tick labels assume that row/column 0
# of `confusion` is the negative class - confirm against the label order used to build it.
labels = np.asarray(['True Neg', 'False Pos', 'False Neg', 'True Pos']).reshape(2, 2)
ax = sns.heatmap(confusion, annot=labels, fmt='', cmap='Blues')
# The title names the classifier actually evaluated here (it previously read 'KNN').
ax.set_title('SVM Confusion Matrix\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
# Tick labels, in the same order as the rows/columns of the matrix
ax.xaxis.set_ticklabels(['False', 'True'])
ax.yaxis.set_ticklabels(['False', 'True'])
# Write the file BEFORE showing: once plt2.show() has displayed the figure, a later
# plt2.savefig() would act on a new, empty figure.
ax.figure.savefig('doc2vec_SVM.PNG')
plt2.show()

In [None]:
# Classify every abstract of the subset from its Doc2Vec vector
Y_doc2vec = classifier_svm.predict(X=mat)

# mat was built in the row order of df, so the predictions are attached as a new column
df['Prediction_SVM'] = Y_doc2vec
df.head()

In [None]:
# Agreement table: KNN predictions (rows) against SVM predictions (columns), with totals
pd.crosstab(index=df['Prediction_KNN'], columns=df['Prediction_SVM'], margins=True, margins_name="Total")

## 1.4. Decision Tree

In [None]:
# Decision-tree classifier with scikit-learn's default parameters, fitted on the training vectors
from sklearn.tree import DecisionTreeClassifier
classifier_DT = DecisionTreeClassifier()
classifier_DT.fit(X=X_train_doc2vec, y=Y_train_doc2vec)

In [None]:
# Decision-tree predictions for the test vectors
Y_pred_doc2vec = classifier_DT.predict(X=X_test_doc2vec)

In [None]:
# Classifier performance on the held-out labelled abstracts
from sklearn.metrics import classification_report, confusion_matrix

# Pin the class order so that row/column 0 of the matrix is the negative class.
# Without `labels`, confusion_matrix sorts the classes, which puts 'Big-data' before
# 'Non Big-data' and makes position [0, 0] the true positives - the opposite of the
# 'True Neg' / 'False' annotations drawn on the heat-map in the following cell.
# NOTE(review): assumes the label column holds exactly these two strings (they are the
# values that other cells compare against) - confirm.
class_order = ['Non Big-data', 'Big-data']
confusion = confusion_matrix(Y_test_doc2vec, Y_pred_doc2vec, labels=class_order)
print(classification_report(Y_test_doc2vec, Y_pred_doc2vec))

In [None]:
# Heat-map of the decision-tree confusion matrix, annotated with the name of each quadrant.
# NOTE(review): the quadrant names and the False/True tick labels assume that row/column 0
# of `confusion` is the negative class - confirm against the label order used to build it.
labels = np.asarray(['True Neg', 'False Pos', 'False Neg', 'True Pos']).reshape(2, 2)
ax = sns.heatmap(confusion, annot=labels, fmt='', cmap='Blues')
# The title names the classifier actually evaluated here (it previously read 'KNN').
ax.set_title('Decision Tree Confusion Matrix\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
# Tick labels, in the same order as the rows/columns of the matrix
ax.xaxis.set_ticklabels(['False', 'True'])
ax.yaxis.set_ticklabels(['False', 'True'])
# Write the file BEFORE showing: once plt2.show() has displayed the figure, a later
# plt2.savefig() would act on a new, empty figure.
ax.figure.savefig('doc2vec_DT.PNG')
plt2.show()

In [None]:
# Classify every abstract of the subset from its Doc2Vec vector
Y_doc2vec = classifier_DT.predict(X=mat)

# mat was built in the row order of df, so the predictions are attached as a new column
df['Prediction_DT'] = Y_doc2vec
df.head()

In [None]:
# Agreement table: decision-tree predictions (rows) against SVM predictions (columns), with totals
pd.crosstab(index=df['Prediction_DT'], columns=df['Prediction_SVM'], margins=True, margins_name="Total")

In [None]:
# Agreement table: decision-tree predictions (rows) against KNN predictions (columns), with totals
pd.crosstab(index=df['Prediction_DT'], columns=df['Prediction_KNN'], margins=True, margins_name="Total")

## 1.5. Random Forest

In [None]:
# Random-forest classifier with scikit-learn's default parameters, fitted on the training vectors
from sklearn.ensemble import RandomForestClassifier
classifier_RF = RandomForestClassifier()
classifier_RF.fit(X=X_train_doc2vec, y=Y_train_doc2vec)

In [None]:
# Random-forest predictions for the test vectors
Y_pred_doc2vec = classifier_RF.predict(X=X_test_doc2vec)

In [None]:
# Classifier performance on the held-out labelled abstracts
from sklearn.metrics import classification_report, confusion_matrix

# Pin the class order so that row/column 0 of the matrix is the negative class.
# Without `labels`, confusion_matrix sorts the classes, which puts 'Big-data' before
# 'Non Big-data' and makes position [0, 0] the true positives - the opposite of the
# 'True Neg' / 'False' annotations drawn on the heat-map in the following cell.
# NOTE(review): assumes the label column holds exactly these two strings (they are the
# values that other cells compare against) - confirm.
class_order = ['Non Big-data', 'Big-data']
confusion = confusion_matrix(Y_test_doc2vec, Y_pred_doc2vec, labels=class_order)
print(classification_report(Y_test_doc2vec, Y_pred_doc2vec))

In [None]:
# Heat-map of the random-forest confusion matrix, annotated with the name of each quadrant.
# NOTE(review): the quadrant names and the False/True tick labels assume that row/column 0
# of `confusion` is the negative class - confirm against the label order used to build it.
labels = np.asarray(['True Neg', 'False Pos', 'False Neg', 'True Pos']).reshape(2, 2)
ax = sns.heatmap(confusion, annot=labels, fmt='', cmap='Blues')
# The title names the classifier actually evaluated here (it previously read 'KNN').
ax.set_title('Random Forest Confusion Matrix\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
# Tick labels, in the same order as the rows/columns of the matrix
ax.xaxis.set_ticklabels(['False', 'True'])
ax.yaxis.set_ticklabels(['False', 'True'])
# Write the file BEFORE showing: once plt2.show() has displayed the figure, a later
# plt2.savefig() would act on a new, empty figure.
ax.figure.savefig('doc2vec_RF.PNG')
plt2.show()

In [None]:
# Classify every abstract of the subset from its Doc2Vec vector
Y_doc2vec = classifier_RF.predict(X=mat)

# mat was built in the row order of df, so the predictions are attached as a new column
df['Prediction_RF'] = Y_doc2vec
df.head()

In [None]:
# Agreement table: random-forest predictions (rows) against decision-tree predictions (columns), with totals
pd.crosstab(index=df['Prediction_RF'], columns=df['Prediction_DT'], margins=True, margins_name="Total")

In [None]:
# Agreement table: random-forest predictions (rows) against SVM predictions (columns), with totals
pd.crosstab(index=df['Prediction_RF'], columns=df['Prediction_SVM'], margins=True, margins_name="Total")

In [None]:
# Agreement table: random-forest predictions (rows) against KNN predictions (columns), with totals
pd.crosstab(index=df['Prediction_RF'], columns=df['Prediction_KNN'], margins=True, margins_name="Total")

## 1.6. Neural network classifier

In [None]:
# Multi-layer-perceptron classifier with scikit-learn's default parameters, fitted on the training vectors
from sklearn.neural_network import MLPClassifier
classifier_nn = MLPClassifier()
classifier_nn.fit(X=X_train_doc2vec, y=Y_train_doc2vec)

In [None]:
# Neural-network predictions for the test vectors
Y_pred_doc2vec = classifier_nn.predict(X=X_test_doc2vec)

In [None]:
# Classifier performance on the held-out labelled abstracts
from sklearn.metrics import classification_report, confusion_matrix

# Pin the class order so that row/column 0 of the matrix is the negative class.
# Without `labels`, confusion_matrix sorts the classes, which puts 'Big-data' before
# 'Non Big-data' and makes position [0, 0] the true positives - the opposite of the
# 'True Neg' / 'False' annotations drawn on the heat-map in the following cell.
# NOTE(review): assumes the label column holds exactly these two strings (they are the
# values that other cells compare against) - confirm.
class_order = ['Non Big-data', 'Big-data']
confusion = confusion_matrix(Y_test_doc2vec, Y_pred_doc2vec, labels=class_order)
print(classification_report(Y_test_doc2vec, Y_pred_doc2vec))

In [None]:
# Heat-map of the neural-network confusion matrix, annotated with the name of each quadrant.
# NOTE(review): the quadrant names and the False/True tick labels assume that row/column 0
# of `confusion` is the negative class - confirm against the label order used to build it.
labels = np.asarray(['True Neg', 'False Pos', 'False Neg', 'True Pos']).reshape(2, 2)
ax = sns.heatmap(confusion, annot=labels, fmt='', cmap='Blues')
# The title names the classifier actually evaluated here (it previously read 'KNN').
ax.set_title('Neural Network Confusion Matrix\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
# Tick labels, in the same order as the rows/columns of the matrix
ax.xaxis.set_ticklabels(['False', 'True'])
ax.yaxis.set_ticklabels(['False', 'True'])
# Write the file BEFORE showing: once plt2.show() has displayed the figure, a later
# plt2.savefig() would act on a new, empty figure.
ax.figure.savefig('doc2vec_NN.PNG')
plt2.show()

In [None]:
# Classify every abstract of the subset from its Doc2Vec vector
Y_doc2vec = classifier_nn.predict(X=mat)

# mat was built in the row order of df, so the predictions are attached as a new column
df['Prediction_NN'] = Y_doc2vec
df.head()

In [None]:
# Agreement table: neural-network predictions (rows) against KNN predictions (columns), with totals
pd.crosstab(index=df['Prediction_NN'], columns=df['Prediction_KNN'], margins=True, margins_name="Total")

In [None]:
# Agreement table: neural-network predictions (rows) against SVM predictions (columns), with totals
pd.crosstab(index=df['Prediction_NN'], columns=df['Prediction_SVM'], margins=True, margins_name="Total")

In [None]:
# Agreement table: neural-network predictions (rows) against decision-tree predictions (columns), with totals
pd.crosstab(index=df['Prediction_NN'], columns=df['Prediction_DT'], margins=True, margins_name="Total")

In [None]:
# Agreement table: neural-network predictions (rows) against random-forest predictions (columns), with totals
pd.crosstab(index=df['Prediction_NN'], columns=df['Prediction_RF'], margins=True, margins_name="Total")

In [None]:
# Save the frame with all five prediction columns.
# index=False: the row index is not written, so reading the file back with a plain
# pd.read_csv (as the decision-rule section does) does not produce a stray
# 'Unnamed: 0' column; the row number is already stored in the 'index' column.
df.to_csv("/project/biocomplexity/sdad/projects_data/ncses/prd/Digital_abstract_labelled/FR_final_predicted_doc2vec.csv", index=False)

## 1.7. Decision rule based on all classifiers

In [None]:
# Re-load the saved predictions and count, for each abstract, how many of the five
# classifiers did NOT predict "Non Big-data"; the resulting score lies between 0 and 5.
df = pd.read_csv("/project/biocomplexity/sdad/projects_data/ncses/prd/Digital_abstract_labelled/FR_final_predicted_doc2vec.csv")
prediction_columns = ['Prediction_KNN', 'Prediction_SVM', 'Prediction_DT', 'Prediction_RF', 'Prediction_NN']
df['score'] = sum(
    np.where(df[column].str.contains("Non Big-data"), 0, 1)
    for column in prediction_columns
)

In [None]:
# Preview the first rows, now including the 'score' column
df.head()

In [None]:
# Distribution of abstracts: number of abstracts at each score value
df['score'].value_counts()

In [None]:
# Majority vote: an abstract is flagged as Big data (1) when more than two of the five
# classifiers voted for it, otherwise 0. With an odd number of classifiers there are no ties.
majority = df['score'] > 2
df['Big_data'] = majority.astype('int64')
df['Big_data'].value_counts()

In [None]:
# Subset to the abstracts flagged as Big data by the majority vote, and save them.
# .copy() makes df_bigdata an independent frame, so the FY conversion applied to it
# in the descriptive-statistics section does not assign into a slice of df.
df_bigdata = df[df['Big_data'] == 1].copy()
df_bigdata.to_csv("/project/biocomplexity/sdad/projects_data/ncses/prd/Digital_abstract_labelled/abstracts_classification_big_data_doc2vec.csv")

## Descriptive statistics of our Big data

In [None]:
# Number of Big-data projects (majority-vote subset) per fiscal year, drawn as a bar chart.
from matplotlib.ticker import FuncFormatter

# .assign builds a new frame instead of writing into df_bigdata, which may be a slice of df.
df_bigdata = df_bigdata.assign(FY=df_bigdata["FY"].astype('int'))
year_counts = df_bigdata['FY'].value_counts().sort_index(ascending=True)

# Distribution
year = year_counts.index.tolist()
count = year_counts.values

fig = plt2.figure()
ax = fig.add_subplot(111)
ax.set_axisbelow(True)
plt2.grid(True, color = "whitesmoke")
plt2.bar(year, count, color='navy')
plt2.xlim(2007.3,2020.7)
plt2.xticks(year, rotation=45)
# Thousands separators come from a formatter, so the labels follow the ticks wherever
# matplotlib places them (set_yticklabels pins label strings without pinning tick positions).
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:,}'.format(int(value))))
plt2.xlabel("FY")
plt2.ylabel("Number of Projects")
plt2.title("Big data sample")

plt2.savefig("big_data_time.png", dpi = 800, bbox_inches = "tight")

In [None]:
# Share (in percent of df_bigdata rows) of each funding department, drawn as a bar chart
agency_counts = 100*df_bigdata["DEPARTMENT"].value_counts()/len(df_bigdata)
agency = agency_counts.index.tolist()
count = agency_counts.values

fig, ax = plt2.subplots()
ax.set_axisbelow(True)
ax.grid(True, color="whitesmoke")
ax.bar(agency, count, color=cb_pal['blue'])
ax.set_ylim(0, 100)
ax.set_xlabel("Agency")
ax.set_ylabel("Percent of Dataset")
ax.set_title("Project Distribution by Funding Agency")

fig.savefig("big_data_agency.png", dpi=800, bbox_inches="tight")