-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
39 lines (30 loc) · 1.14 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
from dotenv import load_dotenv
import pandas as pd
from src.preprocess.preprocess import preprocess_dataframe
from src.vectorization.vectorization import vectorize_text
from src.evaluate.elbow_method import elbow_method
from src.clustering.clustering import clusterize
from src.utils import get_top_words_per_cluster
def vectorize_df(df):
    """Vectorize the ``full_text`` column of *df*.

    Hands ``df['full_text']`` to ``vectorize_text`` and returns the pair it
    produces, in the same order: ``(vectorized, vectorizer)``.
    """
    text_column = df['full_text']
    matrix, fitted_vectorizer = vectorize_text(text_column)
    return matrix, fitted_vectorizer
def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty, so that a missing
            setting is reported by name instead of surfacing later as an
            unrelated pandas error.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Required environment variable {name!r} is not set "
            "(define it in the environment or in the dotenv file)."
        )
    return value


def main():
    """Run the clustering pipeline end to end.

    Steps: load configuration through ``load_dotenv``, read the CSV named by
    ``NEWS_CSV_FILE``, preprocess it, vectorize its ``full_text`` column,
    choose a cluster count with ``elbow_method``, cluster, store the labels
    in a ``cluster`` column, and print the first rows of the title/cluster
    columns.

    Raises:
        RuntimeError: if ``NEWS_CSV_FILE``, ``NEWS_TITLE_COLUMN`` or
            ``NEWS_CONTENT_COLUMN`` is missing or empty.
    """
    # Load config
    load_dotenv()
    # Read each setting once and fail early with a clear message.
    csv_path = _require_env('NEWS_CSV_FILE')
    title_column = _require_env('NEWS_TITLE_COLUMN')
    content_column = _require_env('NEWS_CONTENT_COLUMN')

    # Preprocessing data
    dataframe = preprocess_dataframe(
        pd.read_csv(csv_path),
        title_label=title_column,
        content_label=content_column,
    )

    # Processing
    # Reuse the module's helper instead of repeating the column lookup.
    vectorized_dataframe, dataframe_vectorizer = vectorize_df(dataframe)
    # NOTE(review): 12 is presumably the upper bound on k explored by the
    # elbow method -- confirm against elbow_method's signature.
    best_k = elbow_method(vectorized_dataframe, 12)
    clusters, kmeans = clusterize(vectorized_dataframe, best_k)
    dataframe['cluster'] = clusters

    # Outputs
    # The return value (if any) is ignored here; presumably the helper
    # reports the top words itself -- TODO confirm.
    get_top_words_per_cluster(kmeans, dataframe_vectorizer)
    print(dataframe[[title_column, 'cluster']].head())
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()