-
Notifications
You must be signed in to change notification settings - Fork 6
/
preprocess.py
35 lines (28 loc) · 1.07 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import nltk
# Fetch the NLTK data packages at import time. Only the 'stopwords' corpus is
# referenced by the code in this file (see word_extractor); 'wordnet' and
# 'punkt' are not used here.
# NOTE(review): presumably they are needed by other modules - confirm before removing.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import difflib
import pandas as pd
def word_extractor(sentence):
    """Split *sentence* into lowercase word tokens with stop words removed.

    The sentence is tokenized on runs of word characters (``\\w+``), so
    punctuation and whitespace are discarded. Each token is lowercased and
    then dropped if it appears in the NLTK stop-word corpus.

    Args:
        sentence: Free-text string to tokenize.

    Returns:
        A list of lowercase tokens, in their original order, that are not
        stop words. Duplicates are kept.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lowered = [token.lower() for token in tokenizer.tokenize(sentence)]
    # Load the stop-word list once and keep it in a set: calling
    # stopwords.words() per token rebuilt the whole list each time and
    # scanned it linearly. Called without a language argument it returns the
    # stop words of every language in the corpus, as the original code did.
    stop_words = set(stopwords.words())
    return [token for token in lowered if token not in stop_words]
def symptoms(symptoms):
    """Map free-text terms onto column names of the training dataset.

    The column names of ``dataset/Training.csv`` are used as the vocabulary.
    Each term is fuzzy-matched against it with ``difflib.get_close_matches``
    (similarity cutoff 0.6, difflib's default limit of matches per term), and
    all matches are merged together.

    Args:
        symptoms: Iterable of candidate words.

    Returns:
        A set of vocabulary entries that closely match at least one term;
        empty when nothing matches.
    """
    training_frame = pd.read_csv('dataset/Training.csv', delimiter=',')
    known_columns = training_frame.columns.tolist()
    # Flatten the per-term match lists straight into a set.
    return {
        match
        for term in symptoms
        for match in difflib.get_close_matches(term, known_columns, cutoff=0.6)
    }
#print(symptoms(word_extractor(input("Describe your symptoms: "))))
#from predict import predictor
#print(predictor(symptoms(word_extractor(input("Describe your symptoms: ")))))