forked from Valzavator/MLProcess
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ML_process.py
56 lines (46 loc) · 1.89 KB
/
ML_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import math
import matplotlib.pyplot as plt
import nltk
import scipy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import Word
from wordcloud import WordCloud
# Fetch the NLTK data this script needs at runtime:
#   'stopwords' — the English stop-word list used in prepare_messages;
#   'punkt'     — the sentence/word tokenizer models required by word_tokenize
#                 (without it word_tokenize raises LookupError on a fresh install).
nltk.download('stopwords')
nltk.download('punkt')
def prepare_messages(messages: list) -> list:
    """Normalize raw messages into cleaned, space-joined token strings.

    Each message is lower-cased, word-tokenized, filtered down to purely
    alphabetic non-stop-words, and verb-lemmatized. Messages that end up
    with no surviving tokens are dropped.

    Args:
        messages: iterable of raw message strings.

    Returns:
        List of cleaned messages, one space-joined string per surviving
        input message.
    """
    # Hoist the stop-word list out of the loop and make it a set:
    # stopwords.words() builds a fresh list on every call, and list
    # membership is O(n) per token — together that was accidental O(n*m).
    stop_words = set(stopwords.words("english"))
    processed = []
    for message in messages:
        words = word_tokenize(message.lower())
        tokens = [Word(word).lemmatize("v") for word in words
                  if word.isalpha() and word not in stop_words]
        if tokens:  # drop messages that were all stop-words/punctuation
            processed.append(tokens)
    return [' '.join(tokens) for tokens in processed]
def vectorize_messages(messages: list):
    """Turn the prepared messages into a TF-IDF document-term matrix.

    Args:
        messages: list of preprocessed message strings.

    Returns:
        Sparse TF-IDF matrix with one row per message.
    """
    return TfidfVectorizer().fit_transform(messages)
def clusterize(count_of_clusters: int, matrix):
    """Assign each row of a TF-IDF matrix to one of K clusters via KMeans.

    Args:
        count_of_clusters: number of clusters K.
        matrix: sparse document-term matrix (as produced by
            vectorize_messages). NOTE: the original annotation
            ``scipy.matrix`` no longer exists in modern scipy, so the
            annotation is dropped here.

    Returns:
        Array of cluster labels, one per input row.
    """
    # n_jobs was deprecated in scikit-learn 0.23 and removed in 0.25;
    # passing it raises TypeError on current versions. random_state=0
    # keeps clustering reproducible across runs.
    model = KMeans(n_clusters=count_of_clusters, random_state=0)
    # fit_predict == fit(X) followed by predict(X), in a single pass.
    return model.fit_predict(matrix)
def build_clouds(model, count_of_clusters: int, messages, filepath: str):
    """Render one word cloud per cluster into a single figure and save it.

    Args:
        model: sequence of cluster labels, one per message (output of
            clusterize).
        count_of_clusters: total number of clusters.
        messages: the preprocessed message strings, index-aligned with
            ``model``.
        filepath: path the combined figure is written to.

    Side effects:
        Prints each cluster's members, saves the figure to ``filepath``
        and shows it interactively.
    """
    # Grid dimensions depend only on count_of_clusters — compute once,
    # not inside the per-cluster loop. height * width >= count for the
    # ceil/round pairing used here.
    width = math.ceil(math.sqrt(count_of_clusters))
    height = round(math.sqrt(count_of_clusters))
    result_cloud = plt.figure(figsize=(40, 30))
    for num_of_cluster in range(count_of_clusters):
        print("\nCLUSTER: " + str(num_of_cluster) + "\n")
        # Collect the cluster's messages in a list and join once —
        # repeated `text +=` in a loop is quadratic.
        parts = []
        for idx, cluster in enumerate(model):
            if num_of_cluster == cluster:
                print(cluster, "-", idx, "-", messages[idx])
                parts.append(messages[idx])
        text = " ".join(parts)
        if text:  # skip empty clusters — WordCloud raises on empty text
            word_cloud = WordCloud(background_color='white').generate(text)
            result_cloud.add_subplot(height, width, num_of_cluster + 1)
            plt.axis('off')
            plt.imshow(word_cloud)
    plt.savefig(filepath)
    plt.show()