In [3]:
!pip install transformers



In [5]:
!export LC_ALL="en_US.UTF-8"
!export LANG="en_US.UTF-8"
!pip install pytextrank


Collecting pytextrank
  Downloading pytextrank-3.2.4-py3-none-any.whl (30 kB)
Collecting icecream>=2.1
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting graphviz>=0.13
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m678.7 kB/s[0m eta [36m0:00:00[0m:--:--[0m
Collecting asttokens>=2.0.1
  Downloading asttokens-2.2.1-py2.py3-none-any.whl (26 kB)
Collecting executing>=0.3.1
  Downloading executing-1.2.0-py2.py3-none-any.whl (24 kB)
Installing collected packages: executing, graphviz, asttokens, icecream, pytextrank
Successfully installed asttokens-2.2.1 executing-1.2.0 graphviz-0.20.1 icecream-2.1.3 pytextrank-3.2.4


In [18]:
!pip install gensim




In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
import gensim
from gensim.summarization import keywords as gensim_keywords

# Preprocessing function
def preprocess_tweet(tweet):
    """Normalize a raw tweet into lowercase, space-separated alphabetic tokens.

    Steps: lowercase, drop URLs, drop @mentions and the '#' sign (the hashtag
    word itself is kept), drop every character that is not a-z or whitespace,
    then collapse runs of whitespace into single spaces.

    Args:
        tweet: Raw tweet text. Other values are converted with ``str``, except
            ``None`` and float NaN, which are treated as an empty tweet.

    Returns:
        str: The cleaned tweet; may be the empty string.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" and end up in the embeddings and keywords.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ''
    tweet = str(tweet).lower()
    # 'http\S+' already matches 'https...' links, so no separate alternative.
    tweet = re.sub(r'http\S+|www\S+', '', tweet)
    tweet = re.sub(r'@\w+|#', '', tweet)
    tweet = re.sub(r'[^a-z\s]', '', tweet)
    # split()/join collapses any leftover whitespace runs and trims the ends.
    return ' '.join(tweet.split())

# Read the tweets from the CSV file; the raw text is taken from the 'Text' column
df = pd.read_csv('test.csv')

# Clean every tweet once and keep the result both as a plain list (fed to the
# tokenizer later) and as a new 'processed_tweet' column on the DataFrame
tweets = [preprocess_tweet(raw_text) for raw_text in df['Text']]
df['processed_tweet'] = tweets

# Dataset wrapper that tokenizes tweets for BERT (the pretrained model is used as-is; no fine-tuning is performed)
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per index.

    Each item is a dict with the flattened 'input_ids' and 'attention_mask'
    tensors the tokenizer produced for that tweet.
    """

    def __init__(self, tweets, tokenizer, max_len):
        """Keep the tweets, the tokenizer and the maximum sequence length."""
        self.tweets, self.tokenizer, self.max_len = tweets, tokenizer, max_len

    def __len__(self):
        """Return the number of tweets held by the dataset."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Tokenize the tweet at position ``item``.

        The tokenizer is asked to add special tokens, pad to ``max_len``,
        truncate longer inputs and return PyTorch tensors.

        Returns:
            dict: 'input_ids' and 'attention_mask', each flattened to 1-D.
        """
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(self.tweets[item], **tokenizer_options)
        # Flatten drops the leading batch dimension the tokenizer adds.
        return {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained uncased BERT: tokenizer plus encoder, with the encoder moved to the chosen device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Batches of 16 tweets in their original order (shuffle off), so that row i of
# the extracted embeddings lines up with tweet i
tweet_dataset = TweetDataset(tweets, tokenizer, max_len=128)
data_loader = DataLoader(dataset=tweet_dataset, batch_size=16, shuffle=False)

# Run every batch through BERT without tracking gradients and keep, for each
# tweet, the hidden state at sequence position 0 as its embedding
embeddings = []
with torch.no_grad():
    for batch in data_loader:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        first_token_states = outputs.last_hidden_state[:, 0, :]
        embeddings.append(first_token_states.cpu().numpy())

# Stack the per-batch arrays into one (num_tweets, hidden_size) matrix
tweet_embeddings = np.concatenate(embeddings, axis=0)

# Clustering: partition the tweet embeddings into n_clusters groups with k-means
n_clusters = 20
# n_init is spelled out because scikit-learn changed its default from 10 to
# 'auto' (and warns about it in between); pinning it keeps the clustering
# identical across library versions. random_state fixes the initialisation.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# One cluster label per row of tweet_embeddings, in the same order
cluster_labels = kmeans.fit_predict(tweet_embeddings)

# Keyword extraction using Gensim's TextRank
def extract_keywords(text, num_keywords=5):
    """Extract keywords from ``text`` with gensim's ``keywords`` function.

    Args:
        text: The text to analyse.
        num_keywords: Passed to gensim as ``words``, the number of keywords
            requested.

    Returns:
        Whatever gensim returns for ``split=True, scores=False`` (callers in
        this notebook join it as a sequence of strings).
    """
    return gensim_keywords(text, words=num_keywords, scores=False, split=True)

# Show clusters with keywords
# Bucket the processed tweets by cluster label in a single pass instead of
# rescanning every tweet once per cluster; original tweet order is kept
# inside each bucket, so the printed output is unchanged.
tweets_by_cluster = {}
for processed_tweet, label in zip(df['processed_tweet'], cluster_labels):
    tweets_by_cluster.setdefault(int(label), []).append(processed_tweet)

for i in range(n_clusters):
    print(f'Cluster {i}:')
    # A cluster without members simply prints its header.
    for processed_tweet in tweets_by_cluster.get(i, []):
        keywords = extract_keywords(processed_tweet, num_keywords=5)
        print(f'  {", ".join(keywords)}')


2023-04-21 15:55:18.102485: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-21 15:55:18.485552: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-21 15:55:18.485619: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-21 15:55:18.529473: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-21 15:55:19.368739: W tensorflow/stream_executor/platform/de

Cluster 0:
  coworkers compliment, lookbut little
  highfat, keto diet, mind like
  dieting yoyo
  
  goan drink, lool forming diet
  feel comfortable, new diet, gotta
  good whats
  loseweight
  accomplished physicians explain, best way
  wednesdaywellness
  diet spooniesthatlift, free chicken breasts
  discovered gut, add inches, diet
  week diet, factor, dite semaines
  diet plan
  
  scientists finally, high life expectancy
  forcefed fourbanger, dodge charger sedan
  primal fear, paleodietorganicallygrowngrassfed
  bread
  diet plan
  diet plan
  funniest thing, elizabeth chick going
  amp, diet fat, wedoact
  chik fil, cfaone
  says recent, circulating cancer cellsall
  shit tgunna raplifeshit follow, diet
  hiring dietary aide, job
  good diet, question whats, sad
  candy drawer, friend gives
  constant diet, amp
  bout sums
  real diet
  like, hardest thing, bread, garlic
  ago, alljuice diet
  great stuff, large guys, diet
  juice, diet, kids, life
  gotta start, lol
  amp, di