### In this notebook we will implement a classification model and compute cosine similarity

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle
from numpy import dot
from numpy.linalg import norm

In [None]:
# Load the pickled domain-specific stop-word collection used by the cleaning step.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only use a stop_words.ob file that comes from a trusted source.
with open('stop_words.ob', mode='rb') as stop_words_file:
    domain_stop_word = pickle.load(stop_words_file)

In [None]:
# read data file
# The path is relative, so the CSV is looked up in the notebook's working directory.
file_path = 'diseases_with_description.csv'
df = pd.read_csv(file_path)
# Plain-text preview of the first rows of the loaded frame.
print(df.head())

                                         Description           D_Name
0  bone, muscle, ear, otitis, hearing, membrane, ...  musculoskeletal
1  ear, otitis, hearing, throat, sinusitis, bleed...         ear_nose
2  ventilation, oxygen, airway, copd, breathing, ...      respiratory


In [None]:
def clean_text_func(text, stop_words=None):
    """Normalise free text into a space-separated string of lowercase words.

    The text is lower-cased, every character other than ``a``-``z`` is treated
    as a word separator (digits, punctuation, ``/`` and ``^`` included), and
    the resulting words are filtered against a stop-word collection.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str``.
        ``None`` and float NaN are treated as missing and give ``""``.
    stop_words : container of str, optional
        Words to drop from the result. When omitted, the notebook-level
        ``domain_stop_word`` collection is used.

    Returns
    -------
    str
        The kept words joined by single spaces, without leading or trailing
        whitespace; ``""`` when no word survives.
    """
    # A missing value would otherwise be stringified into the literal tokens
    # "none" / "nan" and end up in the vocabulary as if it were a real word.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if stop_words is None:
        stop_words = domain_stop_word

    # A single scan replaces the previous chain of substitutions. Taking runs
    # of a-z also splits on "/" and "^", which the old character class let
    # through, so words glued together by them are now checked against the
    # stop-word collection individually.
    words = re.findall(r"[a-z]+", str(text).lower())

    # str.join avoids rebuilding the output string once per word.
    return " ".join(word for word in words if word not in stop_words)

# Run the cleaner over every description, then preview the first rows.
# clean_text_func is passed directly; no wrapper lambda is needed.
df['Description'] = df['Description'].apply(clean_text_func)
df.head()

Unnamed: 0,Description,D_Name
0,bone muscle ear otitis hearing membrane bleedi...,musculoskeletal
1,ear otitis hearing throat sinusitis bleeding n...,ear_nose
2,ventilation oxygen airway copd breathing acido...,respiratory


# Word Embedding: Bag-of-Words Counts and TF-IDF Vectors

In [None]:
# Both models are fitted on the same cleaned descriptions, so build the
# list once and reuse it.
descriptions = df['Description'].tolist()

# Raw term counts, dropping scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(descriptions)

# TF-IDF weights over the same text with the same stop-word setting.
cv_tfidf = TfidfVectorizer(stop_words="english")
X_tfidf = cv_tfidf.fit_transform(descriptions)

In [None]:
def _vocabulary_columns(vectorizer):
    """Return the fitted vectorizer's feature names, in matrix column order.

    scikit-learn 1.0 introduced ``get_feature_names_out()`` and version 1.2
    removed the older ``get_feature_names()``. The newer accessor is tried
    first, and the old one is kept as a fallback for pre-1.0 installations.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Dense, labelled views of the two document-term matrices: one row per
# description, one column per vocabulary term.
df_cv = pd.DataFrame(X.toarray(), columns=_vocabulary_columns(cv))
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=_vocabulary_columns(cv_tfidf))



In [None]:
# Report the size of the count matrix as (rows, columns).
n_documents, n_terms = df_cv.shape
print((n_documents, n_terms))
def cosine(v1, v2):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors accepted by ``numpy.dot`` and ``numpy.linalg.norm``.

    Returns
    -------
    float
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either
        vector has zero length.
    """
    denominator = norm(v1) * norm(v2)
    # An all-zero vector (for example a query with no in-vocabulary words)
    # has no direction; report "no similarity" instead of dividing by zero
    # and returning NaN with a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

(3, 1016)


### Cosine Similarity

In [None]:
# Free-text symptom description to compare against every chapter.
new_text = ["dizziness loss of balance  vomiting tinnitus of hearing in the high frequency range in one ear difficulty focusing your eyes "]

# Project the query onto each fitted vocabulary; [0] selects its single row.
new_text_cv = cv.transform(new_text).toarray()[0]
new_text_tfidf = cv_tfidf.transform(new_text).toarray()[0]

# Score the query against each chapter row under both representations.
for chpter_number in range(len(df)):
    print(f"This is chpter number : {chpter_number} ")
    score_cv = cosine(df_cv.iloc[chpter_number], new_text_cv)
    print(f"Cosin cv :    {score_cv} ")
    score_tfidf = cosine(df_tfidf.iloc[chpter_number], new_text_tfidf)
    print(f"Cosin TFIDF : {score_tfidf} ")

This is chpter number : 0 
Cosin cv :    0.0818902227600523 
Cosin TFIDF : 0.07304513144543733 
This is chpter number : 1 
Cosin cv :    0.11331668394168082 
Cosin TFIDF : 0.10928108877281124 
This is chpter number : 2 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 


### Implementing the classification model: LogisticRegression

In [None]:
# Show the column labels of df before selecting the feature and target columns.
df.columns

Index(['Description', 'D_Name'], dtype='object')

In [None]:
# Features are the cleaned descriptions; labels are the disease names.
X_train, y_train = df["Description"], df["D_Name"]

In [None]:
# Bag-of-words features for the classifier (no extra stop-word list here).
cv1 = CountVectorizer()
X_train_cv1 = cv1.fit_transform(X_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on pre-1.0 installations.
if hasattr(cv1, "get_feature_names_out"):
    cv1_columns = cv1.get_feature_names_out()
else:
    cv1_columns = cv1.get_feature_names()

# Dense, labelled view of the training matrix for inspection.
pd_cv1 = pd.DataFrame(X_train_cv1.toarray(), columns=cv1_columns)



In [None]:
# Fit a default-configured logistic regression on the count features.
# fit() returns the estimator itself, so the call can be chained.
lr = LogisticRegression().fit(X_train_cv1, y_train)
lr

LogisticRegression()

In [None]:
# Unseen symptom description used to try out the trained classifier.
X_test = (
    "Difficulty sleeping or staying asleep Fever Fluid draining from ear  "
    "Loss of balance. Hearing difficulties. Ear pain"
)
# Apply the same cleaning that was applied to the training descriptions.
cleaned_text = clean_text_func(X_test)

In [None]:
# Encode the cleaned sentence with the vocabulary fitted on the training text;
# transform() takes an iterable of documents, hence the one-element list.
X_test_cv3  = cv1.transform([cleaned_text])
# Predict the label for that single encoded document.
y_pred_cv3 = lr.predict(X_test_cv3)
print(y_pred_cv3)

['ear_nose']


### In conclusion, our model is able to predict the disease category from a symptom description given by the user