In [1]:
from sklearn.datasets import fetch_20newsgroups
cat = ['alt.atheism', 'talk.religion.misc', 'comp.graphics',
       'sci.space']

dataset = fetch_20newsgroups(subset='all', categories=cat)
labels = dataset.target
label_names = dataset.target_names

In [9]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [17]:
import numpy as np
# from nltk.corpus import names
import re
import string

def preproc(data):
  url_re = re.compile(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
  email_re = re.compile('(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])')
  preproc_data = []
  for doc in data:
    # Remover cabeçalho de Email
    doc = re.sub(r'(From:\s+[^\n]+\n)', '', doc)
    doc = re.sub(r'(Subject:[^\n]+\n)', '', doc)
    doc = re.sub(r'(([\sA-Za-z0-9\-]+)?[A|a]rchive-name:[^\n]+\n)', '', doc)
    doc = re.sub(r'(Last-modified:[^\n]+\n)', '', doc)
    doc = re.sub(r'(Version:[^\n]+\n)', '', doc)

    doc = doc.lower()
    doc = re.sub(url_re, '', doc)
    doc = re.sub(email_re, '', doc)
    doc = re.sub(f'[{re.escape(string.punctuation)}]', '', doc)
    doc = re.sub(r'(\d+)', ' ', doc)
    preproc_data.append(doc)
  return np.array(preproc_data)



def get_lemmatized_data(data: list) -> np.array:
  data_proc = []
  lemmatizer = WordNetLemmatizer()
  for doc in data:
    doc = doc.lower()
    doc_lem = ' '.join(lemmatizer.lemmatize(word) for word in doc.split())
    data_proc.append(doc_lem)
  return np.array(data_proc)

In [18]:
data_proc = preproc(dataset.data)
daat_lem = get_lemmatized_data(data_proc)
data_lem[0]

'line organization walla walla college line in article sn mozumder writes date wed apr gmt in article tammy r healy writes bobby i would like to take the liberty to quote from a christian writer named ellen g white i hope that what she said will help you to edit your remark in this group in the future do not set yourself a a standard do not make your opinion your view of duty your interpretation of scripture a criterion for others and in your heart condemn them if they do not come up to your ideal thought fromthe mount of blessing p i hope quoting this doesnt make the atheist gag but i think ellen white put it better than i could tammy point peace bobby mozumder my point is that you set up your view a the only way to believe saying that all eveil in this world is caused by atheism is ridiculous and counterproductive to dialogue in this newsgroups i see in your post a spirit of condemnation of the atheist in this newsgroup bacause they don t believe exactly a you do if youre here to try

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(max_df=0.5, min_df=2, stop_words='english')
vect = bow.fit_transform(data_lem)

In [25]:
from sklearn.cluster import KMeans

K = 4
kmeans = KMeans(n_clusters=K, random_state=42)
kmeans.fit(vect)



In [27]:
from collections import Counter
clusters = kmeans.labels_
Counter(clusters)

Counter({1: 3376, 0: 7, 2: 3, 3: 1})

In [28]:
bow.get_feature_names_out()

array(['aa', 'aah', 'aap', ..., 'zyda', 'zyxel', 'zz'], dtype=object)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')

vect_tfidf = tfidf.fit_transform(data_lem)
kmeans.fit(vect_tfidf)
clusters = kmeans.labels_
Counter(clusters)



Counter({0: 575, 1: 942, 3: 1739, 2: 131})

In [36]:
cluster_label = {i: labels[np.where(clusters == i)] for i in range(K)}
cluster_label

{0: array([0, 0, 3, 0, 0, 0, 0, 0, 3, 3, 3, 0, 3, 0, 0, 3, 3, 0, 3, 3, 0, 3,
        0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 3, 0, 3, 3, 0, 0,
        0, 0, 3, 3, 0, 0, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
        0, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3, 0, 0, 3, 3, 0, 0, 3, 0, 0, 3, 0,
        3, 0, 3, 3, 0, 0, 0, 3, 0, 3, 0, 0, 3, 3, 0, 3, 0, 3, 0, 3, 3, 0,
        3, 0, 3, 3, 0, 0, 3, 3, 0, 0, 0, 3, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0,
        3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 3, 3, 3,
        0, 0, 0, 0, 3, 0, 3, 3, 0, 3, 0, 3, 0, 0, 0, 3, 3, 3, 0, 0, 3, 0,
        0, 0, 0, 0, 3, 3, 0, 0, 3, 0, 0, 3, 3, 3, 3, 3, 0, 0, 0, 0, 3, 3,
        3, 3, 3, 0, 3, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0,
        0, 0, 0, 3, 3, 3, 0, 3, 0, 0, 0, 0, 3, 0, 3, 0, 3, 0, 3, 3, 3, 0,
        0, 3, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 3, 3, 3, 0,
        0, 3, 0, 3, 3, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 3, 3, 0, 3, 0,
        3, 0, 0, 0, 3, 0, 3, 0, 0, 

In [42]:
palavras = tfidf.get_feature_names_out()
centroids = kmeans.cluster_centers_

for cluster, index_list in cluster_label.items():
  counter = Counter(cluster_label[cluster])
  print(f'Cluster {cluster}: {len(index_list)} amostras')
  print("=========================")
  for label_index, count in sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True):
    print(f'{label_names[label_index]}: {count} amostras')
  print("=========================")
  print("Top 10 palavras:")
  for i in centroids[cluster].argsort()[-10:]:
    print(f"{palavras[i]} ")
  print("=========================")
  print("\n")


Cluster 0: 575 amostras
alt.atheism: 342 amostras
talk.religion.misc: 232 amostras
comp.graphics: 1 amostras
Top 10 palavras:
bible 
believe 
atheist 
say 
people 
religion 
wa 
christian 
jesus 
god 


Cluster 1: 942 amostras
comp.graphics: 856 amostras
sci.space: 70 amostras
alt.atheism: 9 amostras
talk.religion.misc: 7 amostras
Top 10 palavras:
nntppostinghost 
email 
bit 
format 
program 
university 
thanks 
graphic 
file 
image 


Cluster 2: 131 amostras
alt.atheism: 79 amostras
talk.religion.misc: 52 amostras
Top 10 palavras:
dont 
odwyer 
keith 
people 
immoral 
think 
value 
morality 
moral 
objective 


Cluster 3: 1739 amostras
sci.space: 917 amostras
alt.atheism: 369 amostras
talk.religion.misc: 337 amostras
comp.graphics: 116 amostras
Top 10 palavras:
think 
ha 
dont 
like 
university 
just 
nntppostinghost 
article 
space 
wa 


