In [147]:
import os
from pathlib import Path
import logging
import itertools

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer

from capstone.config import CapstoneConfig
from capstone.data_access import DataClass

# Log line layout: timestamp, logger name, level, then the message; INFO and above.
FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)

# Absolute path of the directory one level above the current working directory.
PARENT_PATH = Path.cwd().parent.absolute()

# Point the project config at that directory before constructing DataClass.
config = CapstoneConfig()
config.CURRENT_PATH = PARENT_PATH
data = DataClass(config)

In [148]:

# Load the "Requirements List" sheet of the regulatory workbook.
# The workbook is opened inside a context manager so its file handle is
# released as soon as the sheet has been parsed, rather than staying open
# for the rest of the kernel session.
with pd.ExcelFile(os.path.join(data.data_path, "Regulatory Requirements.xlsx")) as xls:
    df = pd.read_excel(xls, "Requirements List")
df.head(2)

Unnamed: 0,id,Regulation,Title / Chapter / Sub part,Chapter / Section,Article / Paragraph,Text,Date,Product Type,GxP,Regulatory Requirement Type,Full article reference,Mapped to a standard?,Standard mapping?
0,1,Brazil ANVISA GMP MD,Chapter 2 General Quality System Requirements,2.1. General Provisions,2.1.1,General Quality System Requirements […] Genera...,2013-03-28 00:00:00,Medical Device,GMP,Core,Brazil ANVISA GMP MD 2.1.1,Mapped to a standard,Regulatory Standards Management
1,2,Brazil ANVISA GMP MD,Chapter 2 General Quality System Requirements,2.2. Management responsibility,2.2.1,General Quality System Requirements […] Manage...,2013-03-28 00:00:00,Medical Device,GMP,Core,Brazil ANVISA GMP MD 2.2.1,Mapped to a standard,"Quality Planning, \nQuality Management"


In [149]:
# Names of the requirements-sheet columns used in this notebook.
TEXT = "Text"
REQUIREMENT = "Regulatory Requirement Type"
IS_MAPPED = "Mapped to a standard?"
TARGET = "Standard mapping?"

# Value of the IS_MAPPED column on rows that carry a standard mapping.
IS_MAPPED_TRUE = "Mapped to a standard"

In [150]:
# Number of rows per distinct value of the IS_MAPPED column.
df.value_counts(subset=IS_MAPPED)

Mapped to a standard?
No mapping              6326
Mapped to a standard    3906
dtype: int64

In [151]:
# Collapse the casing variants of "non-core" onto a single spelling; "Core"
# and "Obsolete" map to themselves. Series.map turns any value that is not a
# key of the mapping into NaN.
requirement_map = dict.fromkeys(["Non-Core", "Non-core", "non-core"], "Non-core")
requirement_map.update({"Core": "Core", "Obsolete": "Obsolete"})

df[REQUIREMENT] = df[REQUIREMENT].map(requirement_map)
df[REQUIREMENT].value_counts()

Core        6316
Non-core    3067
Obsolete     849
Name: Regulatory Requirement Type, dtype: int64

In [152]:
# Keep only the rows whose IS_MAPPED value is IS_MAPPED_TRUE and renumber
# them from 0.
df = df.loc[df[IS_MAPPED].isin([IS_MAPPED_TRUE])].reset_index(drop=True)
# Independent working copy holding just the text and label columns.
temp = df.loc[:, [TEXT, TARGET]].copy()
# Missing-value count for each of the two columns.
temp.isna().sum()

Text                 0
Standard mapping?    0
dtype: int64

In [153]:
def split_labels(raw):
    """Turn one raw target cell into a list of cleaned label strings.

    The cell is lower-cased and split on both newlines and commas. Each piece
    is stripped of surrounding whitespace, empty pieces are dropped, and
    hyphens inside a label are replaced with spaces.

    Parameters
    ----------
    raw : str
        Raw cell value, e.g. "Quality Planning, \\nQuality Management".

    Returns
    -------
    list of str
        Labels in their original order, e.g.
        ["quality planning", "quality management"].
    """
    # Newlines and commas are both separators, so unify them before splitting.
    pieces = raw.lower().replace("\n", ",").split(",")
    return [piece.strip().replace("-", " ") for piece in pieces if piece.strip() != ""]


# Convert target column into list of labels.
# Parse from df rather than temp: temp was copied from df with the same index,
# and reading the untouched source keeps this cell re-runnable (parsing
# temp[TARGET] a second time would hit already-parsed lists and fail).
temp[TARGET] = df[TARGET].map(split_labels)

In [154]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(["shall"])
# Set copy used for membership tests below; stop_words itself stays a list.
stop_word_set = set(stop_words)
stemmer = PorterStemmer()  # created here but not used by this cell
lemmatizer = WordNetLemmatizer()


def tokenize_filter_lemmatize(text):
    """Tokenize text, drop stop words, lemmatize, and re-join with spaces.

    Parameters
    ----------
    text : str
        Lower-cased text with markup and punctuation already replaced by spaces.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    kept = [word for word in word_tokenize(text) if word not in stop_word_set]
    return " ".join(lemmatizer.lemmatize(word) for word in kept)


## Clean text (regulations)
# Start from df rather than temp: temp was copied from df with the same index,
# and temp[TEXT] is overwritten with prompt text further down, so reading the
# untouched source keeps this cell re-runnable.
text_no_markup = (
    df[TEXT]
    # Convert to lower case
    .str.lower()
    # Remove HTML
    .str.replace("<[^<]+?>", " ", regex=True)
    # Remove symbols
    .str.replace(r"[/(){}\[\]\|@,;\-]", " ", regex=True)
    # Remove punctuation
    .str.replace(r"[^\w\s]", " ", regex=True)
)
# Tokenize, remove stopwords, lemmatize, join
temp[TEXT] = text_no_markup.apply(tokenize_filter_lemmatize)

In [155]:
def format_prompt(body, labels):
    """Wrap one cleaned text and its labels in the '###' prompt layout.

    Layout: an opening '###' line, the text, a blank line, a
    'Classification:' header, one 'Class <k>: <label>' line per label
    (numbered from 1), and a closing '###'.
    """
    class_lines = "".join(
        "Class " + str(rank) + ": " + name + "\n"
        for rank, name in enumerate(list(labels), start=1)
    )
    return "###\n" + body + "\n\nClassification:\n" + class_lines + "###"


# temp carries the 0..n-1 index produced by the earlier reset_index, so pairing
# the two columns row by row matches looking each row up by its integer label.
temp[TEXT] = [
    format_prompt(body, labels)
    for body, labels in zip(temp[TEXT], temp[TARGET])
]
    

In [156]:
# list(temp[TARGET][2])

In [162]:
# Spot-check one row: its prompt text, followed by its label list.
sample_row = 4
print(temp.loc[sample_row, TEXT])
print(temp.loc[sample_row, TARGET])

###
general quality system requirement management responsibility resource personnel verification activity manufacturer establish function verification activity provide appropriate resource designates trained personnel perform activity verification

Classification:
Class 1: training
Class 2: samd
Class 3: quality management
###
['training', 'samd', 'quality management']


In [163]:
# Display the working frame (text and label columns) as the cell output.
temp

Unnamed: 0,Text,Standard mapping?
0,###\ngeneral quality system requirement genera...,[regulatory standards management]
1,###\ngeneral quality system requirement manage...,"[quality planning, quality management]"
2,###\ngeneral quality system requirement manage...,[quality management]
3,###\ngeneral quality system requirement manage...,[quality management]
4,###\ngeneral quality system requirement manage...,"[training, samd, quality management]"
...,...,...
3901,###\npost market surveillance system suited ac...,[medical device post market surveillance]
3902,###\ndata gathered manufacturer post market su...,[medical device post market surveillance]
3903,###\ncourse post market surveillance need prev...,[medical device post market surveillance]
3904,###\neconomic operator mean manufacturer autho...,[medical device post market surveillance]


In [None]:
# Output to CSV