In [12]:
import pandas as pd
import glob

# This is where my CSV files are
path = 'Outputs/*.csv'

# Combining all CSV files into one DataFrame
all_files = glob.glob(path)
df_list = []

for filename in all_files:
    df = pd.read_csv(filename)
    df_list.append(df)



In [13]:
df_list

[    Category                                            Content  \
 0   Business  As the world’s largest — and one of its most u...   
 1   Business  Air safety officials in the United States are ...   
 2   Business  – When Aamir Dhedhi took his mother to India i...   
 3   Business  Amazon has announced plans to invest $9bn in S...   
 4   Business  Elon Musk suggested testing Tesla’s full self-...   
 5   Business  Air India was once so renowned for its service...   
 6   Business  – The Wall Street Journal has announced staff ...   
 7   Business  European Union chief Ursula von der Leyen has ...   
 8   Business  ByteDance, the owner of the  , has filed a law...   
 9   Business  Toyota has reported record profit and sales fi...   
 10  Business  ​​Australia’s flagship airline Qantas has agre...   
 11  Business  Boeing has called off the inaugural crewed fli...   
 12  Business  Ever since he can remember, Rohit Kumar Sahu k...   
 13  Business  The United States has revoked som

In [14]:
# Concatenating all DataFrames
combined_df = pd.concat(df_list, ignore_index=True)

In [15]:
# Viewing my combined dataset
combined_df

Unnamed: 0,Category,Content,Link,Source,Title
0,Business,As the world’s largest — and one of its most u...,https://www.aljazeera.com/economy/2024/5/8/a-n...,Al Jazeera,"Should India take from the rich, give the poor..."
1,Business,Air safety officials in the United States are ...,https://www.aljazeera.com/economy/2024/5/7/us-...,Al Jazeera,US officials probe allegations Boeing workers ...
2,Business,– When Aamir Dhedhi took his mother to India i...,https://www.aljazeera.com/news/2024/5/8/pakist...,Al Jazeera,Pakistan bets on a cannabis high as its econom...
3,Business,Amazon has announced plans to invest $9bn in S...,https://www.aljazeera.com/economy/2024/5/7/ama...,Al Jazeera,Amazon to invest $9bn in Singapore to expand c...
4,Business,Elon Musk suggested testing Tesla’s full self-...,https://www.aljazeera.com/economy/2024/5/8/elo...,Al Jazeera,"Elon Musk floated robotaxi launch in China, Ch..."
...,...,...,...,...,...
158,Sports,Denver Nuggets star Nikola Jokic was named the...,https://www.nbcnews.com/news/sports/nuggets-st...,NBC,Nuggets star Nikola Jokic wins 2024 NBA MVP aw...
159,Sports,INDIANAPOLIS — Indianapolis police announced W...,https://www.nbcnews.com/news/sports/milwaukee-...,NBC,Police investigating after Milwaukee Bucks pla...
160,Sports,An Indianapolis sports columnist will not be c...,https://www.nbcnews.com/news/us-news/indystar-...,NBC,IndyStar columnist won't cover Caitlin Clark a...
161,Sports,"Three-time , 34, is making his way from to ...",https://www.nbcnews.com/pop-culture/pop-cultur...,NBC,Travis Kelce lands first major acting role in ...


### Preparing Data For Classification


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initializing a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000)

# Fitting and transforming the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_df['Category'].values.astype('U'))  


### Applying Clustering Algorithm

In [18]:
from sklearn.cluster import KMeans
import pickle

num_clusters = 4

# Initializing K-Means clustering
km = KMeans(n_clusters=num_clusters)

# Fittting K-Means clustering on the TF-IDF matrix
km.fit(tfidf_matrix)

#saving as a pickle 
with open('kmeans_model.pkl', 'wb') as f:
    pickle.dump(km, f)

### Assigning clusters to Dataframe

In [20]:
km.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 3, 3, 3, 3, 3, 3,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2])

In [19]:
# Adding cluster labels to DataFrame
combined_df['Cluster'] = km.labels_

In [9]:
combined_df.head()

Unnamed: 0,Category,Content,Link,Source,Title,Cluster
0,Business,As the world’s largest — and one of its most u...,https://www.aljazeera.com/economy/2024/5/8/a-n...,Al Jazeera,"Should India take from the rich, give the poor...",2
1,Business,Air safety officials in the United States are ...,https://www.aljazeera.com/economy/2024/5/7/us-...,Al Jazeera,US officials probe allegations Boeing workers ...,2
2,Business,– When Aamir Dhedhi took his mother to India i...,https://www.aljazeera.com/news/2024/5/8/pakist...,Al Jazeera,Pakistan bets on a cannabis high as its econom...,2
3,Business,Amazon has announced plans to invest $9bn in S...,https://www.aljazeera.com/economy/2024/5/7/ama...,Al Jazeera,Amazon to invest $9bn in Singapore to expand c...,2
4,Business,Elon Musk suggested testing Tesla’s full self-...,https://www.aljazeera.com/economy/2024/5/8/elo...,Al Jazeera,"Elon Musk floated robotaxi launch in China, Ch...",2


### Saving the dataframe with clusters

In [39]:
combined_df.to_csv('combined_data.csv')

In [40]:
cluster_categories = []
for cluster_id in range(num_clusters):
    cluster_articles = combined_df[combined_df['Cluster'] == cluster_id]
    most_common_category = cluster_articles['Category'].mode().values[0]
    cluster_categories.append(most_common_category)

In [41]:
cluster_categories

['Business', 'Politics', 'Sports', 'Entertainment']

### Testing clustering

In [42]:
# Printing number of articles in each cluster
print(combined_df['Cluster'].value_counts())

Cluster
1    56
2    39
0    36
3    32
Name: count, dtype: int64


In [38]:
for cluster_id in range(num_clusters):
    cluster_category = cluster_categories[cluster_id]
    print(f"Cluster {cluster_id}: {cluster_category}")
    print(combined_df[(combined_df['Cluster'] == cluster_id) & (combined_df['Category'] == cluster_category)].head(5))
    print("\n")

Cluster 0: Business
   Category                                            Content  \
0  Business  As the world’s largest — and one of its most u...   
1  Business  Air safety officials in the United States are ...   
2  Business  – When Aamir Dhedhi took his mother to India i...   
3  Business  Amazon has announced plans to invest $9bn in S...   
4  Business  Elon Musk suggested testing Tesla’s full self-...   

                                                Link      Source  \
0  https://www.aljazeera.com/economy/2024/5/8/a-n...  Al Jazeera   
1  https://www.aljazeera.com/economy/2024/5/7/us-...  Al Jazeera   
2  https://www.aljazeera.com/news/2024/5/8/pakist...  Al Jazeera   
3  https://www.aljazeera.com/economy/2024/5/7/ama...  Al Jazeera   
4  https://www.aljazeera.com/economy/2024/5/8/elo...  Al Jazeera   

                                               Title  Cluster  
0  Should India take from the rich, give the poor...        0  
1  US officials probe allegations Boeing worke