# **Sistem Rekomendasi Obat Menggunakan Model Content Based Filtering**

In [None]:
import pandas as pd #library memproses data (pembersihan data, manipulasi data, hingga melakukan analisis data)
import numpy as np #library untuk mengolah matriks
#sklearn:support machine learning dengan mendukung berbagai algoritma seperti regresi linier, klasifikasi, pengelompokan, dan lain sebagainya
from sklearn.metrics.pairwise import cosine_similarity #menghitung kemiripan antar dokumen
from sklearn.feature_extraction.text import TfidfVectorizer #menghitung frekuensi kata pada setiap dokumen kemudian mengubah matriks kata-kata tersebut menjadi skor TF-IDF
from nltk.corpus import stopwords #kata yang tidak penting
import re #urutan karakter yang membentuk pola pencarian
import random #menghasilkan angka acak
import nltk #Natural Language Tool Kit, library untuk membantu memproses teks
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Menginput data
data = pd.read_csv("Data Fix.csv", sep=';')

In [None]:
# Melihat 5 data teratas
data.head()

Unnamed: 0,Nama_Obat,Manufacturer,Content,Indications,Contraindications,Adverse_Reactions,Diseases,MIMS_Class
0,Atarax,Mersifarma TM,Alprazolam,Short-term therapy of moderate or severe anxie...,Hypersensitivity to benzodiazepines. Acute nar...,"Drowsiness, ataxia/muscle weakness, amnesia, d...",Anxiety; Insomnia; Premenstrual Dysphoric Diso...,Anxiolytics
1,Merlopam,Mersifarma TM,Lorazepam,Short-term treatment of anxiety or anxiety-rel...,Narrow-angle glaucoma; severe resp insufficien...,"Sedation followed by dizziness, weakness & uns...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics
2,Proclozam,Meprofarm,Clobazam,"Conditions associated with anxiety, tension, s...",Hypersensitivity. Myasthenia gravis. History o...,"Fatigue, dry mouth, constipation, loss of appe...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics
3,Renaquil,Fahrenheit,Lorazepam,"Anxiety-related disorders, emotionally-induced...",Primary depressive disorders; primary treatmen...,"Drowsiness, dizziness, clumsiness, sedation, t...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics
4,Valdimex,Mersifarma TM,Diazepam,Short-term management of anxiety. Adjunct in t...,Resp depression; acute pulmonary insufficiency...,"Drowsiness, lightheadedness, ataxia, amnesia, ...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283 entries, 0 to 282
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Nama_Obat          283 non-null    object
 1   Manufacturer       283 non-null    object
 2   Content            283 non-null    object
 3   Indications        283 non-null    object
 4   Contraindications  283 non-null    object
 5   Adverse_Reactions  283 non-null    object
 6   Diseases           283 non-null    object
 7   MIMS_Class         283 non-null    object
dtypes: object(8)
memory usage: 17.8+ KB


In [None]:
data.isna().sum()

Nama_Obat            0
Manufacturer         0
Content              0
Indications          0
Contraindications    0
Adverse_Reactions    0
Diseases             0
MIMS_Class           0
dtype: int64

In [None]:
# Melihat data Duplicate pada dataset
data.duplicated().sum()

0

In [None]:
# Combine 5 columns into one text column
def combine5col(colA, colB, colC, colD, colE):
  """Join five parallel columns row by row into space-separated strings.

  Rows are paired by position (not by index label), so the inputs may be
  lists or pandas Series with any index. Every value is converted with
  ``str()`` before joining. Pairing stops at the shortest input.

  Returns:
    A list with one combined string per row.
  """
  return [
    " ".join(str(value) for value in row)
    for row in zip(colA, colB, colC, colD, colE)
  ]

In [None]:
data['Description'] = combine5col(data['Content'],data['Indications'],data['Contraindications'],data['Adverse_Reactions'],data['Diseases'])
data

Unnamed: 0,Nama_Obat,Manufacturer,Content,Indications,Contraindications,Adverse_Reactions,Diseases,MIMS_Class,Description
0,Atarax,Mersifarma TM,Alprazolam,Short-term therapy of moderate or severe anxie...,Hypersensitivity to benzodiazepines. Acute nar...,"Drowsiness, ataxia/muscle weakness, amnesia, d...",Anxiety; Insomnia; Premenstrual Dysphoric Diso...,Anxiolytics,Alprazolam Short-term therapy of moderate or s...
1,Merlopam,Mersifarma TM,Lorazepam,Short-term treatment of anxiety or anxiety-rel...,Narrow-angle glaucoma; severe resp insufficien...,"Sedation followed by dizziness, weakness & uns...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,Lorazepam Short-term treatment of anxiety or a...
2,Proclozam,Meprofarm,Clobazam,"Conditions associated with anxiety, tension, s...",Hypersensitivity. Myasthenia gravis. History o...,"Fatigue, dry mouth, constipation, loss of appe...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,"Clobazam Conditions associated with anxiety, t..."
3,Renaquil,Fahrenheit,Lorazepam,"Anxiety-related disorders, emotionally-induced...",Primary depressive disorders; primary treatmen...,"Drowsiness, dizziness, clumsiness, sedation, t...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,"Lorazepam Anxiety-related disorders, emotional..."
4,Valdimex,Mersifarma TM,Diazepam,Short-term management of anxiety. Adjunct in t...,Resp depression; acute pulmonary insufficiency...,"Drowsiness, lightheadedness, ataxia, amnesia, ...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,Diazepam Short-term management of anxiety. Adj...
...,...,...,...,...,...,...,...,...,...
278,Plegicol,Ikapharmindo,Citicoline,"Unconsciousness due to brain damage, head inju...",Hypertonia of the parasympathetic nervous system.,"Epigastric distress, nausea; skin redness; hea...",Alzheimer's Disease and Dementia; Parkinson's ...,Peripheral Vasodilators & Cerebral Activators ...,Citicoline Unconsciousness due to brain damage...
279,Revolan,Sanbe,Piracetam,"Post-traumatic symptoms, involutional symptoms...",Hypersensitivity. Severe renal insufficiency C...,"Nervousness, irritability, insomnia, anxiety, ...",Epilepsy (Pediatric); Alzheimer's Disease and ...,Nootropics & Neurotonics/Neurotrophics,"Piracetam Post-traumatic symptoms, involutiona..."
280,Rincobal,Yarindo Farmatama,Mecobalamin,Peripheral neuropathy.,Hypersensitivity.,"Reduced appetite, nausea, anorexia, vomiting, ...",Neuropathic Pain; Vertigo.,Nootropics & Neurotonics/Neurotrophics,Mecobalamin Peripheral neuropathy. Hypersensit...
281,Sevotam 800,Ifars,Piracetam,"Involutional symptoms related to aging eg, mem...",Severe kidney function impairment (CrCl <20 mL...,"Tremor, insomnia, fatigue, drowsiness, anxiety...",Epilepsy (Pediatric); Alzheimer's Disease and ...,Nootropics & Neurotonics/Neurotrophics,Piracetam Involutional symptoms related to agi...


# **TEXT PRE-PROCESSING**
- Case Folding

---
- Remove Punctuation
---
- Stopword Removal
---
- Lemmatization
---
- Tokenizing
---






**Case Folding**

In [None]:
# Case folding: lower-case every whitespace-separated token and re-join the
# tokens with single spaces.
data['description_clear'] = data['Description'].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)

In [None]:
data['description_clear']

0      alprazolam short-term therapy of moderate or s...
1      lorazepam short-term treatment of anxiety or a...
2      clobazam conditions associated with anxiety, t...
3      lorazepam anxiety-related disorders, emotional...
4      diazepam short-term management of anxiety. adj...
                             ...                        
278    citicoline unconsciousness due to brain damage...
279    piracetam post-traumatic symptoms, involutiona...
280    mecobalamin peripheral neuropathy. hypersensit...
281    piracetam involutional symptoms related to agi...
282    citicoline unconsciousness due to head trauma ...
Name: description_clear, Length: 283, dtype: object

In [None]:
cek = data.loc[93, 'description_clear']
cek

'valproic acid monotherapy or adjunctive therapy on the treatment of partial seizures (simple & complex) & absence seizure (petit mal seizures). hypersensitivity. patients with hepatic disease or hepatic dysfunction. nausea, vomiting, indigestion, diarrhea, abdominal pain, constipation, decreased appetite & weight loss or increased appetite & weight gain, sedation, tremor, ataxia, headache, nystagmus, diplopia, asterixis, dysarthria, impaired coordination, alopecia/hair loss, skin rash, erythema multiforme, sjs, emotional upset, depression, psychosis, aggressions, hyperactivity, weakness, thrombocytopenia, hemorrhage, bruising, anemia, bone marrow suppresion, elevated transaminase & bilirubin, abnormal thyroid function, amenorrhagia, galactorrhea, acute pancreatitis, hyperammonemia, hyperglycinemia, edema on extremities. epilepsy; epilepsy (pediatric); headache; migraine headache; bipolar disorder; rheumatic fever - acute.'

In [None]:
casefolding = pd.DataFrame(data['description_clear'])
casefolding

Unnamed: 0,description_clear
0,alprazolam short-term therapy of moderate or s...
1,lorazepam short-term treatment of anxiety or a...
2,"clobazam conditions associated with anxiety, t..."
3,"lorazepam anxiety-related disorders, emotional..."
4,diazepam short-term management of anxiety. adj...
...,...
278,citicoline unconsciousness due to brain damage...
279,"piracetam post-traumatic symptoms, involutiona..."
280,mecobalamin peripheral neuropathy. hypersensit...
281,piracetam involutional symptoms related to agi...


In [None]:
file_name = 'case folding.xlsx'

# saving the excel
casefolding.to_excel(file_name)
print('DataFrame is written to Excel File successfully.')

DataFrame is written to Excel File successfully.


**Remove Punctuation**

In [None]:
# Remove punctuation: every character that is neither a word character nor
# whitespace is replaced by a space. The pattern is a raw string so that
# \w and \s reach the regex engine instead of being treated as (invalid)
# Python string escapes.
data['clean_punct'] = data['description_clear'].str.replace(r'[^\w\s]', ' ', regex=True)

In [None]:
data['clean_punct']

0      alprazolam short term therapy of moderate or s...
1      lorazepam short term treatment of anxiety or a...
2      clobazam conditions associated with anxiety  t...
3      lorazepam anxiety related disorders  emotional...
4      diazepam short term management of anxiety  adj...
                             ...                        
278    citicoline unconsciousness due to brain damage...
279    piracetam post traumatic symptoms  involutiona...
280    mecobalamin peripheral neuropathy  hypersensit...
281    piracetam involutional symptoms related to agi...
282    citicoline unconsciousness due to head trauma ...
Name: clean_punct, Length: 283, dtype: object

In [None]:
cek = data.loc[93, 'clean_punct']
cek

'valproic acid monotherapy or adjunctive therapy on the treatment of partial seizures  simple   complex    absence seizure  petit mal seizures   hypersensitivity  patients with hepatic disease or hepatic dysfunction  nausea  vomiting  indigestion  diarrhea  abdominal pain  constipation  decreased appetite   weight loss or increased appetite   weight gain  sedation  tremor  ataxia  headache  nystagmus  diplopia  asterixis  dysarthria  impaired coordination  alopecia hair loss  skin rash  erythema multiforme  sjs  emotional upset  depression  psychosis  aggressions  hyperactivity  weakness  thrombocytopenia  hemorrhage  bruising  anemia  bone marrow suppresion  elevated transaminase   bilirubin  abnormal thyroid function  amenorrhagia  galactorrhea  acute pancreatitis  hyperammonemia  hyperglycinemia  edema on extremities  epilepsy  epilepsy  pediatric   headache  migraine headache  bipolar disorder  rheumatic fever   acute '

**Remove Double Whitespace (Optional)**

In [None]:
def _normalize_whitespace(text):
    """
    This function normalizes whitespaces, removing duplicates.
    """
    corrected = str(text)
    corrected = re.sub(r"//t",r"\t", corrected)
    corrected = re.sub(r"( )\1+",r"\1", corrected)
    corrected = re.sub(r"(\n)\1+",r"\1", corrected)
    corrected = re.sub(r"(\r)\1+",r"\1", corrected)
    corrected = re.sub(r"(\t)\1+",r"\1", corrected)
    return corrected.strip(" ")
data['clean_double_ws'] = data['clean_punct'].apply(_normalize_whitespace)

In [None]:
data['clean_double_ws']

0      alprazolam short term therapy of moderate or s...
1      lorazepam short term treatment of anxiety or a...
2      clobazam conditions associated with anxiety te...
3      lorazepam anxiety related disorders emotionall...
4      diazepam short term management of anxiety adju...
                             ...                        
278    citicoline unconsciousness due to brain damage...
279    piracetam post traumatic symptoms involutional...
280    mecobalamin peripheral neuropathy hypersensiti...
281    piracetam involutional symptoms related to agi...
282    citicoline unconsciousness due to head trauma ...
Name: clean_double_ws, Length: 283, dtype: object

In [None]:
cek = data.loc[93, 'clean_double_ws']
cek

'valproic acid monotherapy or adjunctive therapy on the treatment of partial seizures simple complex absence seizure petit mal seizures hypersensitivity patients with hepatic disease or hepatic dysfunction nausea vomiting indigestion diarrhea abdominal pain constipation decreased appetite weight loss or increased appetite weight gain sedation tremor ataxia headache nystagmus diplopia asterixis dysarthria impaired coordination alopecia hair loss skin rash erythema multiforme sjs emotional upset depression psychosis aggressions hyperactivity weakness thrombocytopenia hemorrhage bruising anemia bone marrow suppresion elevated transaminase bilirubin abnormal thyroid function amenorrhagia galactorrhea acute pancreatitis hyperammonemia hyperglycinemia edema on extremities epilepsy epilepsy pediatric headache migraine headache bipolar disorder rheumatic fever acute'

In [None]:
remove_punctuation = pd.DataFrame(data['clean_double_ws'])

file_name = 'remove punctuation.xlsx'

# saving the excel
remove_punctuation.to_excel(file_name)
print('DataFrame is written to Excel File successfully.')

DataFrame is written to Excel File successfully.


**Stopword Removal**

In [None]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
#clean stopwords
stopword = set(stopwords.words('english'))

def clean_stopwords(text):
    """Return ``text`` without the words contained in the global ``stopword`` set."""
    kept_words = []
    for token in text.split():
        if token in stopword:
            continue  # drop stopwords from the description
        kept_words.append(token)
    return ' '.join(kept_words)

# Buat kolom tambahan untuk data description yang telah distopwords removal
data['clean_sw'] = data['clean_double_ws'].apply(clean_stopwords)

In [None]:
data['clean_sw']

0      alprazolam short term therapy moderate severe ...
1      lorazepam short term treatment anxiety anxiety...
2      clobazam conditions associated anxiety tension...
3      lorazepam anxiety related disorders emotionall...
4      diazepam short term management anxiety adjunct...
                             ...                        
278    citicoline unconsciousness due brain damage he...
279    piracetam post traumatic symptoms involutional...
280    mecobalamin peripheral neuropathy hypersensiti...
281    piracetam involutional symptoms related aging ...
282    citicoline unconsciousness due head trauma bra...
Name: clean_sw, Length: 283, dtype: object

In [None]:
cek = data.loc[93, 'clean_sw']
cek

'valproic acid monotherapy adjunctive therapy treatment partial seizures simple complex absence seizure petit mal seizures hypersensitivity patients hepatic disease hepatic dysfunction nausea vomiting indigestion diarrhea abdominal pain constipation decreased appetite weight loss increased appetite weight gain sedation tremor ataxia headache nystagmus diplopia asterixis dysarthria impaired coordination alopecia hair loss skin rash erythema multiforme sjs emotional upset depression psychosis aggressions hyperactivity weakness thrombocytopenia hemorrhage bruising anemia bone marrow suppresion elevated transaminase bilirubin abnormal thyroid function amenorrhagia galactorrhea acute pancreatitis hyperammonemia hyperglycinemia edema extremities epilepsy epilepsy pediatric headache migraine headache bipolar disorder rheumatic fever acute'

**Add stopwords that are not in the NLTK stopword list**

In [None]:
# Load the additional stopwords from a local text file.
# The context manager closes the file handle once its content has been read.
with open("stopword.txt") as stw:
    line = stw.read()
# Every whitespace-separated word in the file becomes one extra stopword.
sw1 = line.split()
sw1

['x',
 'y',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'you',
 'yond',
 'yonder',
 'yon',
 'ye',
 'yet',
 'z',
 'zillion',
 '#####',
 'ddc',
 'dpnp',
 'gad',
 'mdd',
 'ggt',
 'ad',
 'well',
 'will',
 'who',
 'whom',
 'underlying',
 'condition',
 'conditions',
 'medical',
 'ad',
 'pt',
 'patient',
 'patients',
 'cont',
 'likely',
 'including',
 'follow',
 'required',
 'doesnt',
 'goes',
 'but',
 'useful',
 'want',
 'wants',
 'that',
 'thats',
 'require',
 'need',
 'needed',
 'received',
 'come',
 'asking',
 'giving',
 'give',
 'total',
 'developing',
 'still',
 'presumed',
 'believe',
 'later',
 'just',
 'earlier',
 'took',
 'details',
 'liked',
 'noticed',
 'fairly',
 'non',
 'didnt',
 'work',
 'wasnt',
 'www',
 'com',
 'consistent',
 'care',
 'called',
 'may',
 'possible',
 'suggest',
 'clinical',
 'new',
 'old',
 'family',
 'daughter',
 'son',
 'father',
 'mother',
 'husband',
 'unlikely',
 'excluded',
 'group',
 'ok',
 'being',
 '#####',
 'j',
 'u',
 'umpteen',
 'usually',
 'us

In [None]:
# Stopword removal, second pass: drop the additional stopwords held in sw1.
# A set gives constant-time membership tests; testing against the list
# directly rescans it for every word of every description.
extra_stopwords = set(sw1)
data['add_swr'] = data['clean_sw'].apply(
    lambda text: " ".join(word for word in text.split() if word not in extra_stopwords)
)
data['add_swr']

0      alprazolam short term therapy moderate severe ...
1      lorazepam short term treatment anxiety anxiety...
2      clobazam associated anxiety tension sleep diso...
3      lorazepam anxiety disorders emotionally induce...
4      diazepam short term management anxiety adjunct...
                             ...                        
278    citicoline unconsciousness due brain damage he...
279    piracetam post traumatic symptoms involutional...
280    mecobalamin peripheral neuropathy hypersensiti...
281    piracetam involutional symptoms aging memory d...
282    citicoline unconsciousness due head trauma bra...
Name: add_swr, Length: 283, dtype: object

In [None]:
cek = data.loc[93, 'add_swr']
cek

'valproic acid monotherapy adjunctive therapy treatment partial seizures simple complex absence seizure petit mal seizures hypersensitivity hepatic disease hepatic dysfunction nausea vomiting indigestion diarrhea abdominal pain constipation decreased appetite weight loss increased appetite weight gain sedation tremor ataxia headache nystagmus diplopia asterixis dysarthria impaired coordination alopecia hair loss skin rash erythema multiforme sjs emotional upset depression psychosis aggressions hyperactivity weakness thrombocytopenia hemorrhage bruising anemia bone marrow suppresion elevated transaminase bilirubin abnormal thyroid function amenorrhagia galactorrhea acute pancreatitis hyperammonemia hyperglycinemia edema extremities epilepsy epilepsy pediatric headache migraine headache bipolar disorder rheumatic fever acute'

In [None]:
# Export the stopword-free descriptions to Excel.
# The frame gets its own name so that it does not overwrite the
# clean_stopwords() function defined earlier in this notebook; rebinding
# that name would make the stopword-removal cell fail when it is re-run.
clean_stopwords_df = pd.DataFrame(data['add_swr'])

file_name = 'clean stopwords.xlsx'

# saving the excel
clean_stopwords_df.to_excel(file_name)
print('DataFrame is written to Excel File successfully.')

DataFrame is written to Excel File successfully.


**Lemmatization**

In [None]:
nltk.download('wordnet')
wn= nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
def lemmatization(text):
    """Lemmatize each whitespace-separated word of ``text`` and re-join with spaces.

    Uses the notebook-level lemmatizer ``wn``; ``wn.lemmatize`` is called
    without a part-of-speech argument.
    """
    # The former filter `if word in text` was always true (each word comes
    # from text.split()), so it only added a substring scan per word.
    return ' '.join(wn.lemmatize(word) for word in text.split())

# Buat kolom tambahan untuk data description yang telah dilemmatization
data['desc_clean'] = data['add_swr'].apply(lemmatization)

In [None]:
data['desc_clean']

0      alprazolam short term therapy moderate severe ...
1      lorazepam short term treatment anxiety anxiety...
2      clobazam associated anxiety tension sleep diso...
3      lorazepam anxiety disorder emotionally induced...
4      diazepam short term management anxiety adjunct...
                             ...                        
278    citicoline unconsciousness due brain damage he...
279    piracetam post traumatic symptom involutional ...
280    mecobalamin peripheral neuropathy hypersensiti...
281    piracetam involutional symptom aging memory de...
282    citicoline unconsciousness due head trauma bra...
Name: desc_clean, Length: 283, dtype: object

In [None]:
cek = data.loc[93, 'desc_clean']
cek

'valproic acid monotherapy adjunctive therapy treatment partial seizure simple complex absence seizure petit mal seizure hypersensitivity hepatic disease hepatic dysfunction nausea vomiting indigestion diarrhea abdominal pain constipation decreased appetite weight loss increased appetite weight gain sedation tremor ataxia headache nystagmus diplopia asterixis dysarthria impaired coordination alopecia hair loss skin rash erythema multiforme sjs emotional upset depression psychosis aggression hyperactivity weakness thrombocytopenia hemorrhage bruising anemia bone marrow suppresion elevated transaminase bilirubin abnormal thyroid function amenorrhagia galactorrhea acute pancreatitis hyperammonemia hyperglycinemia edema extremity epilepsy epilepsy pediatric headache migraine headache bipolar disorder rheumatic fever acute'

In [None]:
clean_lemma = pd.DataFrame(data['desc_clean'])

file_name = 'clean lemma.xlsx'

# saving the excel
clean_lemma.to_excel(file_name)
print('DataFrame is written to Excel File successfully.')

DataFrame is written to Excel File successfully.


**Tokenizing**

In [None]:
# Tokenize one sample document (row 93) on whitespace to show the result.
# The frame gets its own name so that it does not overwrite the
# lemmatization() function defined earlier in this notebook.
lemma_df = pd.DataFrame(data['desc_clean'])
token = nltk.tokenize.WhitespaceTokenizer().tokenize(lemma_df['desc_clean'][93])
token

['valproic',
 'acid',
 'monotherapy',
 'adjunctive',
 'therapy',
 'treatment',
 'partial',
 'seizure',
 'simple',
 'complex',
 'absence',
 'seizure',
 'petit',
 'mal',
 'seizure',
 'hypersensitivity',
 'hepatic',
 'disease',
 'hepatic',
 'dysfunction',
 'nausea',
 'vomiting',
 'indigestion',
 'diarrhea',
 'abdominal',
 'pain',
 'constipation',
 'decreased',
 'appetite',
 'weight',
 'loss',
 'increased',
 'appetite',
 'weight',
 'gain',
 'sedation',
 'tremor',
 'ataxia',
 'headache',
 'nystagmus',
 'diplopia',
 'asterixis',
 'dysarthria',
 'impaired',
 'coordination',
 'alopecia',
 'hair',
 'loss',
 'skin',
 'rash',
 'erythema',
 'multiforme',
 'sjs',
 'emotional',
 'upset',
 'depression',
 'psychosis',
 'aggression',
 'hyperactivity',
 'weakness',
 'thrombocytopenia',
 'hemorrhage',
 'bruising',
 'anemia',
 'bone',
 'marrow',
 'suppresion',
 'elevated',
 'transaminase',
 'bilirubin',
 'abnormal',
 'thyroid',
 'function',
 'amenorrhagia',
 'galactorrhea',
 'acute',
 'pancreatitis',
 'hy

In [None]:
# Define the tokenizing function
import re
def tokenization(text):
    """Split ``text`` into a list of word tokens.

    The text is split on runs of non-word characters (regex ``\\W+``, i.e.
    anything other than letters, digits and underscore). Empty strings that
    ``re.split`` yields for leading/trailing separators are discarded.
    """
    # The previous pattern 'W+' matched the literal letter "W", so lower-cased
    # text was never split and came back as a single-element list.
    return [token for token in re.split(r'\W+', text) if token]
# Apply the tokenizing function to every cleaned description
data['desc_token'] = data['desc_clean'].apply(tokenization)

In [None]:
cek = data.loc[92, 'desc_token']
cek

['topiramate monotherapy newly diagnosed epilepsy conversion monotherapy epilepsy adjunctive therapy adult childn 2 year partial onset seizure generalized tonic clonic seizure adjunctive therapy seizure associated lennox gastaut syndrome migraine headache prophylaxis adult hypersensitivity somnolence dizziness fatigue irritability decreased weight bradyphrenia paresthesia diplopia abnormal coordination nausea nystagmus lethargy anorexia dysarthria blurred vision decreased appetite memory impairment diarrhea decreased appetite disturbance attention aggression rash abnormal behavior balance disorder constipation depression anxiety asthenia dysgeusia hypoesthesia pyrexia alopecia insomnia expressive language disorder epilepsy epilepsy pediatric headache migraine headache']

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283 entries, 0 to 282
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Nama_Obat          283 non-null    object
 1   Manufacturer       283 non-null    object
 2   Content            283 non-null    object
 3   Indications        283 non-null    object
 4   Contraindications  283 non-null    object
 5   Adverse_Reactions  283 non-null    object
 6   Diseases           283 non-null    object
 7   MIMS_Class         283 non-null    object
 8   Description        283 non-null    object
 9   description_clear  283 non-null    object
 10  clean_punct        283 non-null    object
 11  clean_double_ws    283 non-null    object
 12  clean_sw           283 non-null    object
 13  add_swr            283 non-null    object
 14  desc_clean         283 non-null    object
 15  desc_token         283 non-null    object
dtypes: object(16)
memory usage: 35.5+ KB


In [None]:
# Keep the eight original columns plus the fully cleaned description.
# Selecting by name instead of by position keeps the selection correct even
# if intermediate pre-processing columns are added, removed or reordered.
columns_to_keep = [
    'Nama_Obat', 'Manufacturer', 'Content', 'Indications',
    'Contraindications', 'Adverse_Reactions', 'Diseases', 'MIMS_Class',
    'desc_clean',
]
data_clear = data[columns_to_keep]
data_clear

Unnamed: 0,Nama_Obat,Manufacturer,Content,Indications,Contraindications,Adverse_Reactions,Diseases,MIMS_Class,desc_clean
0,Atarax,Mersifarma TM,Alprazolam,Short-term therapy of moderate or severe anxie...,Hypersensitivity to benzodiazepines. Acute nar...,"Drowsiness, ataxia/muscle weakness, amnesia, d...",Anxiety; Insomnia; Premenstrual Dysphoric Diso...,Anxiolytics,alprazolam short term therapy moderate severe ...
1,Merlopam,Mersifarma TM,Lorazepam,Short-term treatment of anxiety or anxiety-rel...,Narrow-angle glaucoma; severe resp insufficien...,"Sedation followed by dizziness, weakness & uns...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,lorazepam short term treatment anxiety anxiety...
2,Proclozam,Meprofarm,Clobazam,"Conditions associated with anxiety, tension, s...",Hypersensitivity. Myasthenia gravis. History o...,"Fatigue, dry mouth, constipation, loss of appe...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,clobazam associated anxiety tension sleep diso...
3,Renaquil,Fahrenheit,Lorazepam,"Anxiety-related disorders, emotionally-induced...",Primary depressive disorders; primary treatmen...,"Drowsiness, dizziness, clumsiness, sedation, t...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,lorazepam anxiety disorder emotionally induced...
4,Valdimex,Mersifarma TM,Diazepam,Short-term management of anxiety. Adjunct in t...,Resp depression; acute pulmonary insufficiency...,"Drowsiness, lightheadedness, ataxia, amnesia, ...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,diazepam short term management anxiety adjunct...
...,...,...,...,...,...,...,...,...,...
278,Plegicol,Ikapharmindo,Citicoline,"Unconsciousness due to brain damage, head inju...",Hypertonia of the parasympathetic nervous system.,"Epigastric distress, nausea; skin redness; hea...",Alzheimer's Disease and Dementia; Parkinson's ...,Peripheral Vasodilators & Cerebral Activators ...,citicoline unconsciousness due brain damage he...
279,Revolan,Sanbe,Piracetam,"Post-traumatic symptoms, involutional symptoms...",Hypersensitivity. Severe renal insufficiency C...,"Nervousness, irritability, insomnia, anxiety, ...",Epilepsy (Pediatric); Alzheimer's Disease and ...,Nootropics & Neurotonics/Neurotrophics,piracetam post traumatic symptom involutional ...
280,Rincobal,Yarindo Farmatama,Mecobalamin,Peripheral neuropathy.,Hypersensitivity.,"Reduced appetite, nausea, anorexia, vomiting, ...",Neuropathic Pain; Vertigo.,Nootropics & Neurotonics/Neurotrophics,mecobalamin peripheral neuropathy hypersensiti...
281,Sevotam 800,Ifars,Piracetam,"Involutional symptoms related to aging eg, mem...",Severe kidney function impairment (CrCl <20 mL...,"Tremor, insomnia, fatigue, drowsiness, anxiety...",Epilepsy (Pediatric); Alzheimer's Disease and ...,Nootropics & Neurotonics/Neurotrophics,piracetam involutional symptom aging memory de...


In [None]:
#Simpan dalam bentuk CSV
data_clear.to_csv("Data Clean.csv", sep=';')

#Membaca dalam bentuk CSV
import pandas as pd
data1 = pd.read_csv('Data Clean.csv', sep=';')
data1.head()

Unnamed: 0.1,Unnamed: 0,Nama_Obat,Manufacturer,Content,Indications,Contraindications,Adverse_Reactions,Diseases,MIMS_Class,desc_clean
0,0,Atarax,Mersifarma TM,Alprazolam,Short-term therapy of moderate or severe anxie...,Hypersensitivity to benzodiazepines. Acute nar...,"Drowsiness, ataxia/muscle weakness, amnesia, d...",Anxiety; Insomnia; Premenstrual Dysphoric Diso...,Anxiolytics,alprazolam short term therapy moderate severe ...
1,1,Merlopam,Mersifarma TM,Lorazepam,Short-term treatment of anxiety or anxiety-rel...,Narrow-angle glaucoma; severe resp insufficien...,"Sedation followed by dizziness, weakness & uns...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,lorazepam short term treatment anxiety anxiety...
2,2,Proclozam,Meprofarm,Clobazam,"Conditions associated with anxiety, tension, s...",Hypersensitivity. Myasthenia gravis. History o...,"Fatigue, dry mouth, constipation, loss of appe...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,clobazam associated anxiety tension sleep diso...
3,3,Renaquil,Fahrenheit,Lorazepam,"Anxiety-related disorders, emotionally-induced...",Primary depressive disorders; primary treatmen...,"Drowsiness, dizziness, clumsiness, sedation, t...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,lorazepam anxiety disorder emotionally induced...
4,4,Valdimex,Mersifarma TM,Diazepam,Short-term management of anxiety. Adjunct in t...,Resp depression; acute pulmonary insufficiency...,"Drowsiness, lightheadedness, ataxia, amnesia, ...",Anxiety; Insomnia; Epilepsy; Epilepsy (Pediatr...,Anxiolytics,diazepam short term management anxiety adjunct...


In [None]:
# Drop the index column that the CSV round trip stored as "Unnamed: 0".
# errors="ignore" keeps this cell re-runnable: a second run no longer raises
# KeyError once the column is already gone.
data1 = data1.drop(columns=["Unnamed: 0"], errors="ignore")

# **TF IDF, COSINE dan REKOMENDASI ngram = 1,1**

**Count Vectorizer**


In [None]:
# Build the raw term-count (bag-of-words) matrix from the cleaned descriptions.
from sklearn.feature_extraction.text import CountVectorizer

# Word-level unigrams only; no custom tokenizer and no stop-word list are passed.
count_vec = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    encoding='latin-1',
    tokenizer=None,
    stop_words=None,
)

# Fit the vocabulary on `desc_clean`, then turn the result into a dense array.
doc_term_counts = count_vec.fit_transform(data1['desc_clean'])
countvec_1 = doc_term_counts.toarray()
countvec_1

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

**Perhitungan TF**

In [None]:
# Wrap the count matrix in a DataFrame (default integer row/column labels) for display.
countvec_2 = pd.DataFrame(data=countvec_1)
countvec_2


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1743,1744,1745,1746,1747,1748,1749,1750,1751,1752
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
281,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
# Collect the vocabulary terms (one per count-matrix column) learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(count_vec, "get_feature_names_out"):
    kata_kata_1 = count_vec.get_feature_names_out().tolist()
else:
    kata_kata_1 = count_vec.get_feature_names()
kata_kata_1



['10',
 '100',
 '102',
 '12',
 '125',
 '13',
 '14',
 '15',
 '16',
 '160',
 '16x',
 '17',
 '18',
 '20',
 '200',
 '24',
 '25',
 '250',
 '2500',
 '30',
 '31',
 '325',
 '350',
 '37',
 '3a4',
 '40',
 '400',
 '50',
 '500',
 '5000',
 '54',
 '65',
 '73',
 '75',
 'abarticular',
 'abdomen',
 'abdominal',
 'abnormal',
 'abnormality',
 'abrasion',
 'abscess',
 'absence',
 'accelerate',
 'accelerated',
 'accelerates',
 'accidental',
 'accommodation',
 'accompanied',
 'accompany',
 'acetaminophen',
 'acetosal',
 'acetylcysteine',
 'acetylsalicylic',
 'ache',
 'acid',
 'acne',
 'acting',
 'activation',
 'active',
 'activity',
 'acuity',
 'acute',
 'adam',
 'adaptation',
 'adaptive',
 'add',
 'addiction',
 'addictive',
 'addison',
 'additive',
 'adenoidectomy',
 'adequate',
 'adhd',
 'adjacent',
 'adjunct',
 'adjunctive',
 'adjunctively',
 'adjuvant',
 'administration',
 'adolescent',
 'adult',
 'affective',
 'agent',
 'aggravated',
 'aggresion',
 'aggression',
 'aggressive',
 'aging',
 'agitation',
 

In [None]:
# Term-count table whose columns are labelled with the vocabulary terms.
countvec_3 = pd.DataFrame(data=countvec_1, columns=kata_kata_1)
countvec_3

Unnamed: 0,10,100,102,12,125,13,14,15,16,160,...,weight,withdrawal,woman,wound,yawn,yawning,year,zika,zonisamide,zoster
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
281,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
# Export the term-frequency table to an Excel workbook.
file_name = 'TF(1,1).xlsx'
countvec_3.to_excel(excel_writer=file_name)
print('DataFrame is written to Excel File successfully.')

DataFrame is written to Excel File successfully.


**Perhitungan TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with IDF: no IDF smoothing, no sublinear TF scaling,
# and no row normalisation of the resulting vectors.
tfidf_options = dict(use_idf=True, smooth_idf=False, sublinear_tf=False, norm=None)
transformer = TfidfTransformer(**tfidf_options)
TF_IDF = transformer.fit_transform(countvec_1)

In [None]:
# Dense array copy of the TF-IDF weights (one row per document) for inspection.
tfidf_1 = TF_IDF.toarray()
tfidf_1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# TF-IDF weights as a DataFrame with default integer row/column labels.
tfidf_2 = pd.DataFrame(data=tfidf_1)
tfidf_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1743,1744,1745,1746,1747,1748,1749,1750,1751,1752
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,4.566005,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,3.119086,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,4.16054,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,3.119086,0.0,0.0,0.0
279,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
280,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
281,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,2.567909,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [None]:
# Vocabulary terms used to label the TF-IDF columns.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(count_vec, "get_feature_names_out"):
    kata_kata_2 = count_vec.get_feature_names_out().tolist()
else:
    kata_kata_2 = count_vec.get_feature_names()



In [None]:
# TF-IDF weights with the vocabulary terms as column labels.
TF_IDF_1 = pd.DataFrame(data=tfidf_1, columns=kata_kata_2)
TF_IDF_1

Unnamed: 0,10,100,102,12,125,13,14,15,16,160,...,weight,withdrawal,woman,wound,yawn,yawning,year,zika,zonisamide,zoster
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,4.566005,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,3.119086,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,4.16054,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,3.119086,0.0,0.0,0.0
279,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
280,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
281,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,2.567909,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [None]:
# Labelled view of the TF-IDF matrix:
#   columns -> vocabulary terms taken from the cleaned descriptions
#   rows    -> drug names (Nama_Obat)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(count_vec, "get_feature_names_out"):
    tfidf_terms = count_vec.get_feature_names_out().tolist()
else:
    tfidf_terms = count_vec.get_feature_names()

TF_IDF1 = pd.DataFrame(
    TF_IDF.toarray(),  # plain ndarray instead of the np.matrix returned by todense()
    columns=tfidf_terms,
    index=data1.Nama_Obat
)
TF_IDF1



Unnamed: 0_level_0,10,100,102,12,125,13,14,15,16,160,...,weight,withdrawal,woman,wound,yawn,yawning,year,zika,zonisamide,zoster
Nama_Obat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Atarax,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
Merlopam,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
Proclozam,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
Renaquil,0.0,0.0,0.0,4.566005,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,3.119086,0.0,0.0,0.0
Valdimex,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,4.16054,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plegicol,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,3.119086,0.0,0.0,0.0
Revolan,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
Rincobal,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
Sevotam 800,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,2.567909,0.00000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [None]:
# Export the labelled TF-IDF table to an Excel workbook.
file_name = 'TF-IDF(1,1).xlsx'
TF_IDF1.to_excel(excel_writer=file_name)
print('DataFrame is written to Excel File successfully.')

DataFrame is written to Excel File successfully.


**Perhitungan Cosine Similarity**



In [None]:
# Pairwise cosine similarity between every pair of TF-IDF document vectors.
cosine_sim1 = cosine_similarity(X=TF_IDF, Y=TF_IDF)
cosine_sim1

array([[1.        , 0.36666847, 0.17627579, ..., 0.00644793, 0.06322148,
        0.02499122],
       [0.36666847, 1.        , 0.372947  , ..., 0.04216799, 0.13825922,
        0.03608516],
       [0.17627579, 0.372947  , 1.        , ..., 0.03122101, 0.14482729,
        0.01476087],
       ...,
       [0.00644793, 0.04216799, 0.03122101, ..., 1.        , 0.0724042 ,
        0.04982917],
       [0.06322148, 0.13825922, 0.14482729, ..., 0.0724042 , 1.        ,
        0.08524367],
       [0.02499122, 0.03608516, 0.01476087, ..., 0.04982917, 0.08524367,
        1.        ]])

In [None]:
# Similarity matrix as a DataFrame with default integer row/column labels.
cos_sim = pd.DataFrame(data=cosine_sim1)
cos_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,273,274,275,276,277,278,279,280,281,282
0,1.000000,0.366668,0.176276,0.105710,0.268740,0.246951,0.288394,0.414549,0.213075,0.395200,...,0.012296,0.009347,0.005086,0.022738,0.052558,0.000000,0.071345,0.006448,0.063221,0.024991
1,0.366668,1.000000,0.372947,0.286170,0.310007,0.279218,0.274533,0.301581,0.234503,0.287244,...,0.016542,0.024393,0.017922,0.063172,0.173646,0.037776,0.141928,0.042168,0.138259,0.036085
2,0.176276,0.372947,1.000000,0.179690,0.288820,0.217488,0.217291,0.175911,0.221307,0.173902,...,0.028468,0.012121,0.009033,0.176066,0.102337,0.010982,0.130037,0.031221,0.144827,0.014761
3,0.105710,0.286170,0.179690,1.000000,0.207430,0.157247,0.147887,0.167635,0.173550,0.181717,...,0.039238,0.044177,0.012334,0.104074,0.066378,0.031234,0.104230,0.092222,0.099307,0.033690
4,0.268740,0.310007,0.288820,0.207430,1.000000,0.347394,0.362543,0.151174,0.168992,0.142407,...,0.044974,0.064506,0.004848,0.068114,0.118064,0.000000,0.131427,0.006145,0.126088,0.040236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,0.000000,0.037776,0.010982,0.031234,0.000000,0.000000,0.000000,0.005858,0.003959,0.005157,...,0.024581,0.003926,0.012695,0.107592,0.106621,1.000000,0.130987,0.016093,0.073424,0.402143
279,0.071345,0.141928,0.130037,0.104230,0.131427,0.104027,0.116302,0.135990,0.101025,0.125852,...,0.081087,0.118433,0.022426,0.258730,0.539738,0.130987,1.000000,0.067396,0.495591,0.116341
280,0.006448,0.042168,0.031221,0.092222,0.006145,0.024100,0.033844,0.042548,0.008618,0.037456,...,0.516183,0.546597,0.573794,0.064890,0.048445,0.016093,0.067396,1.000000,0.072404,0.049829
281,0.063221,0.138259,0.144827,0.099307,0.126088,0.098944,0.109720,0.203806,0.082043,0.189328,...,0.046985,0.040260,0.057115,0.261246,0.416400,0.073424,0.495591,0.072404,1.000000,0.085244


In [None]:
# Label both axes of the similarity matrix with the drug names.
drug_names = data1['Nama_Obat']
cos_sim_df = pd.DataFrame(data=cosine_sim1, index=drug_names, columns=drug_names)
print('Shape:', cos_sim_df.shape)

# Show the drug-by-drug similarity matrix.
cos_sim_df

Shape: (283, 283)


Nama_Obat,Atarax,Merlopam,Proclozam,Renaquil,Valdimex,Valisanbe,Valisanbe Injection,Xanax,Xiety,Zolysan,...,Megabal,Metifer,Mobafer,Noocetam,Nootrisol,Plegicol,Revolan,Rincobal,Sevotam 800,Simciti
Nama_Obat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Atarax,1.000000,0.366668,0.176276,0.105710,0.268740,0.246951,0.288394,0.414549,0.213075,0.395200,...,0.012296,0.009347,0.005086,0.022738,0.052558,0.000000,0.071345,0.006448,0.063221,0.024991
Merlopam,0.366668,1.000000,0.372947,0.286170,0.310007,0.279218,0.274533,0.301581,0.234503,0.287244,...,0.016542,0.024393,0.017922,0.063172,0.173646,0.037776,0.141928,0.042168,0.138259,0.036085
Proclozam,0.176276,0.372947,1.000000,0.179690,0.288820,0.217488,0.217291,0.175911,0.221307,0.173902,...,0.028468,0.012121,0.009033,0.176066,0.102337,0.010982,0.130037,0.031221,0.144827,0.014761
Renaquil,0.105710,0.286170,0.179690,1.000000,0.207430,0.157247,0.147887,0.167635,0.173550,0.181717,...,0.039238,0.044177,0.012334,0.104074,0.066378,0.031234,0.104230,0.092222,0.099307,0.033690
Valdimex,0.268740,0.310007,0.288820,0.207430,1.000000,0.347394,0.362543,0.151174,0.168992,0.142407,...,0.044974,0.064506,0.004848,0.068114,0.118064,0.000000,0.131427,0.006145,0.126088,0.040236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Plegicol,0.000000,0.037776,0.010982,0.031234,0.000000,0.000000,0.000000,0.005858,0.003959,0.005157,...,0.024581,0.003926,0.012695,0.107592,0.106621,1.000000,0.130987,0.016093,0.073424,0.402143
Revolan,0.071345,0.141928,0.130037,0.104230,0.131427,0.104027,0.116302,0.135990,0.101025,0.125852,...,0.081087,0.118433,0.022426,0.258730,0.539738,0.130987,1.000000,0.067396,0.495591,0.116341
Rincobal,0.006448,0.042168,0.031221,0.092222,0.006145,0.024100,0.033844,0.042548,0.008618,0.037456,...,0.516183,0.546597,0.573794,0.064890,0.048445,0.016093,0.067396,1.000000,0.072404,0.049829
Sevotam 800,0.063221,0.138259,0.144827,0.099307,0.126088,0.098944,0.109720,0.203806,0.082043,0.189328,...,0.046985,0.040260,0.057115,0.261246,0.416400,0.073424,0.495591,0.072404,1.000000,0.085244


In [None]:
# Export the cosine-similarity matrix to an Excel workbook.
# NOTE(review): this writes `cos_sim` (integer row/column labels), not the
# name-labelled `cos_sim_df` -- confirm that is the intended export.
file_name = 'cosim(1,1).xlsx'
cos_sim.to_excel(excel_writer=file_name)
print('DataFrame is written to Excel File successfully.')

DataFrame is written to Excel File successfully.


**Rekomendasi dengan Cosine**

In [None]:
# reset_index() keeps the previous index as an 'index' column; a later cell
# drops that column after merging.
data1tes = data1.reset_index()
titles = data1tes['Nama_Obat']

# Lookup from drug name to row position in the similarity matrix.
# Series.drop_duplicates() compares the *values* (the row positions, which are
# always unique), so it never removed repeated drug names. De-duplicate on the
# index labels instead, keeping the first occurrence, so that indices[name]
# always yields a single position.
name_to_position = pd.Series(data1tes.index, index=data1tes['Nama_Obat'])
indices = name_to_position[~name_to_position.index.duplicated(keep='first')]

In [None]:
# Recommendation test
# Takes a drug name as input and returns the most similar drugs.
def rec_tfidf2(title, cosine_sim = cosine_sim1, top_n = 5):
    """Return the drugs most similar to `title`, best match first.

    Parameters
    ----------
    title : str
        Drug name (a value of `Nama_Obat`) to look up in `indices`.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix indexed by row position; defaults to
        `cosine_sim1`.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    pandas.DataFrame
        Columns `Nama_Obat` and `Score`, at most `top_n` rows, sorted by
        descending score. The queried drug itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    idx = indices[title]
    # A repeated drug name makes the lookup return a Series; use the first hit.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the queried row explicitly instead of assuming it sorts first:
    # another drug with an identical description also scores 1.0, and slicing
    # off position 0 could then drop that drug and return the query itself.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    # sorted() is stable, so ties keep their original row order.
    best = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    # Build the result in one go rather than growing a DataFrame row by row.
    return pd.DataFrame(
        {
            'Nama_Obat': [titles.iloc[pos] for pos, _ in best],
            'Score': [score for _, score in best],
        },
        columns=['Nama_Obat', 'Score'],
    )

In [None]:
# Test: show the recommendations for the drug 'Paraco'.
rec_tfidf2(title='Paraco')

Unnamed: 0,Nama_Obat,Score
0,Naprex,0.845639
1,Pamol Suppository,0.833508
2,Sanmol/Sanmol Forte,0.818161
3,Turpas,0.816567
4,Glocetamol Forte,0.815275


In [None]:
# Test: keep the recommendations for 'Paraco' in a DataFrame for the merge step.
paraco_recommendations = rec_tfidf2('Paraco')
hasil_rekomendasi = pd.DataFrame(paraco_recommendations)
hasil_rekomendasi

Unnamed: 0,Nama_Obat,Score
0,Naprex,0.845639
1,Pamol Suppository,0.833508
2,Sanmol/Sanmol Forte,0.818161
3,Turpas,0.816567
4,Glocetamol Forte,0.815275


In [None]:
# Test: enrich the recommendations with the catalogue details of each drug.
# Left-join on the drug name, then drop the helper columns that should not be
# shown ('index', 'desc_clean', 'MIMS_Class').
merged_table = (
    pd.merge(hasil_rekomendasi, data1tes, on='Nama_Obat', how='left')
    .drop(columns=['index', 'desc_clean', 'MIMS_Class'])
)

# Display the result.
merged_table

Unnamed: 0,Nama_Obat,Score,Manufacturer,Content,Indications,Contraindications,Adverse_Reactions,Diseases
0,Naprex,0.872654,Darya-Varia,Paracetamol,To reduce fever & provide pain relief.,Severe liver dysfunction.,"Hypersensitivity reactions, liver dysfunction.",Chronic Pelvic Pain; Chronic Pelvic Pain in Wo...
1,Praxion,0.846659,Pharos,Paracetamol (micronized),Relief of fever due to flu & after immunizatio...,Hepatic or renal failure.,"Hematological, skin & other allergic reactions.",Chronic Pelvic Pain; Chronic Pelvic Pain in Wo...
2,Turpas,0.842663,Simex,Paracetamol micronized,"Reduce fever, relief pain in headache & tootha...",Hypersensitivity. Severe liver dysfunction.,Prolonged & high dose may cause liver damage. ...,Chronic Pelvic Pain; Chronic Pelvic Pain in Wo...
3,Pamol Suppository,0.84134,Interbat,Paracetamol,To reduce pain & fever when oral administratio...,Severe hepatic dysfunction.,Hepatic damage with high doses & prolonged use...,Chronic Pelvic Pain; Chronic Pelvic Pain in Wo...
4,Paraco,0.818161,Coronet,Paracetamol,Relief of pains & reduce body temp associated ...,Hypersensitivity. Hepatic disturbance or sever...,Hypersensitivity reactions. Liver damage in pr...,Chronic Pelvic Pain; Chronic Pelvic Pain in Wo...
