In [None]:
# Load the two input tables. Judging from how they are used further down,
# `u` carries `label` and `source_user_id` columns (presumably one row per user)
# and `g` carries `source_user_id` and `tweet_embedding` columns.
u = pd.read_csv("users.csv")
# float_precision='round_trip' selects pandas' round-trip float converter when
# parsing numeric text in graph.csv.
g = pd.read_csv("graph.csv", float_precision='round_trip')

In [None]:
# Users whose label is 1 (treated as bots in this notebook) and their ids.
bots = u[u['label'] == 1]
bot_source = bots['source_user_id']

# Rows of g authored by those users. The membership filter is evaluated once
# and the embedding column is taken from the filtered frame instead of
# re-running the same isin() scan over g.
bot_tweets = g[g['source_user_id'].isin(bot_source)]
bot_embeddings = bot_tweets['tweet_embedding']

# NOTE: this stacks the embeddings of *every* row in g, not only the bot rows,
# and the name `embeddings` is rebound by the per-user aggregation cell below.
embeddings = np.stack(g['tweet_embedding'].values)


In [None]:
def aggregate_embeddings_for_cluster(user_id, tweets=None, embedding_dim=100):
    """Average all tweet embeddings that belong to one user.

    Parameters
    ----------
    user_id
        Value compared against the ``source_user_id`` column.
    tweets : pandas.DataFrame, optional
        Frame with ``source_user_id`` and ``tweet_embedding`` columns. When
        omitted, the notebook-level frame ``g`` is used, which keeps existing
        one-argument calls working.
    embedding_dim : int, default 100
        Length of the all-zero vector returned for a user with no rows.

    Returns
    -------
    tuple
        ``(user_id, mean_embedding)`` where ``mean_embedding`` is the
        element-wise mean over the user's stacked embeddings, or a zero
        vector of length ``embedding_dim`` if the user has no rows.
    """
    if tweets is None:
        tweets = g

    user_rows = tweets[tweets['source_user_id'] == user_id]
    if user_rows.empty:
        # User did not make any tweets: fall back to a zero vector.
        return user_id, np.zeros(embedding_dim)

    # Stack the per-tweet vectors into one 2-D array and average over tweets.
    # numpy computes the mean directly; no tensor round trip is needed.
    stacked = np.stack(user_rows['tweet_embedding'].values)
    return user_id, stacked.mean(axis=0)

# Apply the function to each unique source_user_id.
# drop_duplicates() makes "unique" actually hold: a repeated id would otherwise
# be aggregated twice and appear twice in the matrix passed to K-means.
aggregated_embeddings = [
    aggregate_embeddings_for_cluster(user_id)
    for user_id in bot_source.drop_duplicates()
]
if not aggregated_embeddings:
    # zip(*[]) below would fail with an opaque "not enough values to unpack".
    raise ValueError("no bot users found; cannot build the embedding matrix for clustering")

# Separate the source_user_ids and embeddings into two parallel tuples
source_ids, embeddings = zip(*aggregated_embeddings)

# Convert the embeddings to a numpy array (one row per user) for use in K-means
embeddings = np.vstack(embeddings)

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit K-means for a range of cluster counts and record the
# inertia of each fit so the "knee" of the curve can be read off the plot.
inertia = []
# KMeans rejects n_clusters > n_samples, so the sweep (1..14 by default) is
# capped at the number of rows in `embeddings`.
range_clusters = range(1, min(14, len(embeddings)) + 1)

for n_clusters in range_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(embeddings)
    inertia.append(kmeans.inertia_)

plt.plot(range_clusters, inertia, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('K-means inertia vs. number of clusters')
plt.show()

In [None]:
# Final model: the cluster count chosen for this run, fitted on the per-user
# embedding matrix. fit() returns the estimator itself, so it can be chained.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(embeddings)

# Cluster index for every row of `embeddings`, in row order.
bot_clusters = kmeans.predict(embeddings)

In [None]:
# Pair every user id with its cluster label. `source_ids` and `bot_clusters`
# both follow the row order of `embeddings`, so they line up positionally.
bot_clusters_ids = pd.DataFrame({'source_id': source_ids, 'cluster': bot_clusters})

In [None]:
# Persist the id -> cluster table without the index column; the database
# section further down reads this file back.
bot_clusters_ids.to_csv('bot_clusters_ids.csv', index=False)

## Getting tweets from database

In [13]:
from pymongo import MongoClient

# Connection settings -- change the db and collection names accordingly.
hostname, port = 'localhost', 27017
client = MongoClient(host=hostname, port=port)

# Database and collection holding the tweets.
db = client['bt4222']
tweets = db['tweets1']

# change to which tweet.json you are using; the value is embedded in the
# names of the exported cluster files.
file_num = 1

In [14]:
# Load the cluster dataset written by the clustering section.
# NOTE: this rebinds `bot_clusters` to a DataFrame with `source_id` and
# `cluster` columns; in the clustering cells the same name held the raw
# array of labels returned by KMeans.
bot_clusters = pd.read_csv("bot_clusters_ids.csv")

In [15]:
# Write the English tweets of every cluster to its own text file, one tweet
# per write, named cluster_<cluster>_<file_num>.txt.
# Cluster labels come from the loaded table rather than a hard-coded range(5),
# so the loop follows whatever number of clusters was saved.
for cluster_num in sorted(bot_clusters['cluster'].unique().tolist()):
    # Plain Python list of the ids that belong to this cluster.
    cluster_ids = bot_clusters.loc[bot_clusters['cluster'] == cluster_num, 'source_id'].tolist()

    pipeline = [
        # Both conditions in a single $match stage: English tweets by cluster members.
        {'$match': {'lang': 'en', 'author_id': {'$in': cluster_ids}}},
        {'$project': {'_id': 0, 'author_id': 1, 'text': 1}},
    ]
    tweet_cursor = tweets.aggregate(pipeline)

    # Naming the columns explicitly guarantees a `text` column even when the
    # query returns nothing; otherwise tweets_df['text'] raises KeyError.
    tweets_df = pd.DataFrame(list(tweet_cursor), columns=['author_id', 'text'])

    with open(f"cluster_{cluster_num}_{file_num}.txt", 'w', encoding="utf-8") as f:
        # dropna() skips documents that have no `text` field (NaN + str would fail).
        for tweet in tweets_df['text'].dropna():
            f.write(tweet + '\n')
    # Reported after the with-block, i.e. once the file has been closed.
    print(f"Cluster {cluster_num} done")

Cluster 0 done
Cluster 1 done
Cluster 2 done
Cluster 3 done
Cluster 4 done
