## Medical Device Recalls

### Step 3 - Prepare Data

Use NLP to preprocess the reason-for-recall texts and tokenize them into TF-IDF features for machine learning.

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# We could also use SelectKBest with the chi2 score function (import commented out below) to select the best features using the Chi-squared statistical test.
# Will try this later. In the meantime, we use TfidfVectorizer's max_features option, which keeps the top terms by frequency.
# from sklearn.feature_selection import SelectKBest, chi2 

# Do not truncate long text cells when displaying DataFrames (None = no width limit).
pd.options.display.max_colwidth = None

In [4]:
# Read only the three columns used in this notebook from the enforcement reports CSV.
df = pd.read_csv(
    "../data/source/enforcement_reports.csv",
    usecols=["recall_number", "reason_for_recall", "classification"],
)

# Display one randomly chosen record, transposed so each field appears on its own row.
df.sample(n=1).transpose()

Unnamed: 0,2398
reason_for_recall,A potential for dual-configured beds to separate or slip away from one another.
classification,Class II
recall_number,Z-1499-2022


In [5]:
# Build the corpus from the recall-reason text. Missing values are replaced with
# empty strings because TfidfVectorizer raises on NaN documents; an empty
# document yields an all-zero feature row, so rows stay aligned with df.
corpus = df["reason_for_recall"].fillna("")

# Unigram TF-IDF with English stop words removed, limited to the 500 terms
# with the highest term frequency across the corpus.
vect = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 1),
    max_features=500,
)

# fit_transform learns the vocabulary and vectorizes the corpus in a single pass;
# it is equivalent to fit() followed by transform() on the same data.
X = vect.fit_transform(corpus)
feature_names = vect.get_feature_names_out()

# Densify the sparse matrix into a DataFrame: one column per term, same index as df.
df_X = pd.DataFrame(X.toarray(), columns=feature_names, index=df.index)

df_X.shape

(6904, 500)

In [6]:
# Preview ten randomly selected rows of the TF-IDF feature matrix.
df_X.sample(n=10)

Unnamed: 0,00,01,06,09,10,11,2018,2020,2021,2022,...,ventilation,ventilator,version,versions,vitros,void,volume,water,wear,wire
6001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Save the TF-IDF feature matrix as CSV; the row index is not written to the file.
df_X.to_csv(
    "../data/prepared/features_tfidf.csv",
    index=False,
)

# The End.