In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read from and write to
# paths under /content/drive/MyDrive/.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install nltk pandas openpyxl



In [None]:
import os
import nltk
from collections import Counter
import string
import pandas as pd
from tqdm import tqdm

def load_text(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents as a single string.
    """
    with open(file_path, encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def preprocess_text(text):
    """Lower-case ``text``, delete ASCII punctuation and tokenize it.

    Punctuation characters (``string.punctuation``) are removed outright, not
    replaced by spaces, before the text is handed to ``nltk.word_tokenize``.

    Args:
        text: Raw document text.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    punctuation_eraser = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_eraser)
    return nltk.word_tokenize(cleaned)

def remove_stopwords(words, language='english', stop_words=None):
    """Filter stop words out of a token list.

    Args:
        words: Iterable of tokens (expected to be lower-cased already, since
            the comparison is an exact match).
        language: Name of the NLTK stop-word list to load when ``stop_words``
            is not given. Defaults to ``'english'``, the previously
            hard-coded value.
        stop_words: Optional iterable of words to drop. When provided, the
            NLTK corpus is not consulted at all, which lets callers reuse one
            list across many documents or supply a domain-specific list.

    Returns:
        A new list with every token that is not a stop word, in the original
        order.
    """
    if stop_words is None:
        stop_words = nltk.corpus.stopwords.words(language)
    # A set makes each membership test O(1).
    excluded = set(stop_words)
    return [word for word in words if word not in excluded]

def get_top_keywords(words, n=20):
    """Return the ``n`` most frequent tokens in ``words``.

    Args:
        words: Iterable of hashable tokens.
        n: Maximum number of keywords to return.

    Returns:
        A list of at most ``n`` tokens ordered from most to least frequent;
        the counts themselves are discarded.
    """
    ranked = Counter(words).most_common(n)
    return [pair[0] for pair in ranked]

def process_files_in_directory(directory_path, n_keywords=20):
    """Extract the top keywords of every ``.txt`` file in a directory.

    Args:
        directory_path: Directory whose ``.txt`` entries are processed.
        n_keywords: Number of keywords to keep per file (default 20).

    Returns:
        A list with one row per ``.txt`` file. Each row is
        ``[filename, kw_1, ..., kw_n]`` and always has ``n_keywords + 1``
        entries: files that yield fewer keywords are padded with ``None`` so
        the rows can be loaded into a fixed-width table.
    """
    # os.listdir returns entries in arbitrary order; sort so the output row
    # order is reproducible between runs.
    txt_filenames = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.txt')
    )

    results = []
    for filename in tqdm(txt_filenames):
        file_path = os.path.join(directory_path, filename)
        text = load_text(file_path)
        words = remove_stopwords(preprocess_text(text))
        top_keywords = get_top_keywords(words, n_keywords)

        # Short or empty documents produce fewer than n_keywords entries;
        # pad so every row has the same width.
        padding = [None] * (n_keywords - len(top_keywords))
        results.append([filename] + top_keywords + padding)

    return results

def main(directory_path, output_file):
    """Extract keywords for every text file and save them to an Excel file.

    Args:
        directory_path: Directory containing the ``.txt`` files to process.
        output_file: Path of the Excel workbook to write.
    """
    nltk.download('punkt')
    # Recent NLTK releases load the 'punkt_tab' resource in word_tokenize
    # instead of the pickled 'punkt' models, so fetch both to work across
    # NLTK versions.
    nltk.download('punkt_tab')
    nltk.download('stopwords')

    results = process_files_in_directory(directory_path)

    column_names = ['Filename'] + [f'Keyword_{i}' for i in range(1, 21)]

    # Build the frame without column labels first and widen it to the full
    # column count: passing columns= directly raises when no row happens to
    # reach the full width (e.g. every file has fewer than 20 keywords).
    df = pd.DataFrame(results).reindex(columns=range(len(column_names)))
    df.columns = column_names

    df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}")
# Run the keyword extraction on the .txt files stored on the mounted Drive and
# write the result to a workbook in the current working directory.
directory_path = '/content/drive/MyDrive/model/txt_token'
output_file = 'top_keywords.xlsx'
main(directory_path, output_file)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 1373/1373 [00:58<00:00, 23.58it/s]


Results saved to top_keywords.xlsx


In [None]:
def read_excel_file(file_path):
    """Load the Excel workbook at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The DataFrame returned by ``pandas.read_excel`` with its defaults.
    """
    return pd.read_excel(file_path)

# Read back the keyword workbook written by main() and display it as the
# cell output to inspect the result.
file_path = 'top_keywords.xlsx'
df = read_excel_file(file_path)
df


Unnamed: 0,Filename,Keyword_1,Keyword_2,Keyword_3,Keyword_4,Keyword_5,Keyword_6,Keyword_7,Keyword_8,Keyword_9,...,Keyword_11,Keyword_12,Keyword_13,Keyword_14,Keyword_15,Keyword_16,Keyword_17,Keyword_18,Keyword_19,Keyword_20
0,Delphy DPY whitepapers - whitepaper.io.txt,,,,,,,,,,...,,,,,,,,,,
1,DecentBet DBET whitepapers - whitepaper.io.txt,house,bet,platform,decent,user,credit,game,would,dbet,...,profit,session,casino,gambling,blockchain,ethereum,page,based,contract,sport
2,Daneel DAN whitepapers - whitepaper.io.txt,daneel,token,io,www,whitepaper,information,user,cryptocurrency,investor,...,service,blockchain,platform,sale,company,use,intelligence,investment,data,assistant
3,DATA DTA whitepapers - whitepaper.io.txt,data,token,node,user,blockchain,foundation,ad,mobile,decentralized,...,ai,trust,based,powered,alliance,whitepaper,network,n,fraud,issuer
4,Darcrus MER whitepapers - whitepaper.io.txt,data,public,use,dappsheet,company,customer,private,site,bet,...,platform,node,version,darcrus,case,release,date,november,revised,december
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,DAD DAD whitepapers - whitepaper.io.txt,user,advertising,dad,blockchain,metaverse,industry,chain,game,advertiser,...,new,content,token,data,development,based,also,number,information,delivery
1369,DAEX DAX whitepapers - whitepaper.io.txt,clearing,daex,exchange,asset,technology,user,cryptocurrency,distributed,based,...,trading,wallet,solution,service,whitepaper,chain,blockchain,technical,v,digital
1370,Decimal DEL whitepapers - whitepaper.io.txt,coin,decimal,token,del,transaction,user,blockchain,network,project,...,contract,service,number,chain,http,custom,validator,reserve,issue,blockchains
1371,DigitalBits XDB whitepapers - whitepaper.io.txt,digitalbits,token,program,asset,server,transaction,loyalty,network,goal,...,blockchain,compliance,federation,db,account,http,hotelbrand,point,digital,system


In [None]:
# NOTE(review): identical to the read_excel_file defined in an earlier cell;
# this redefinition simply shadows it.
def read_excel_file(file_path):
    """Load the Excel workbook at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The DataFrame returned by ``pandas.read_excel`` with its defaults.
    """
    return pd.read_excel(file_path)

# Re-read the local keyword workbook and write a copy to Google Drive.
file_path = 'top_keywords.xlsx'
df = read_excel_file(file_path)

# NOTE(review): 'aved_keywords.xlsx' looks like a typo for
# 'saved_keywords.xlsx' — confirm before renaming, since the file may already
# exist on Drive under this name.
output_file_path = '/content/drive/MyDrive/model/aved_keywords.xlsx'
df.to_excel(output_file_path, index=False)

# Echo the destination path as the cell output.
output_file_path


'/content/drive/MyDrive/model/aved_keywords.xlsx'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import numpy as np
# NOTE(review): third identical definition of read_excel_file in this
# notebook; it shadows the ones from earlier cells.
def read_excel_file(file_path):
    """Load the Excel workbook at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        The DataFrame returned by ``pandas.read_excel`` with its defaults.
    """
    return pd.read_excel(file_path)

# Re-load the per-file keyword table written earlier in the notebook.
file_path = 'top_keywords.xlsx'
df = read_excel_file(file_path)

# Build one whitespace-joined "document" per file from its keyword columns.
# Missing keywords are read back from Excel as NaN; they are skipped because
# str(NaN) would otherwise inject a literal 'nan' token into every short or
# empty row and make those files look alike to TF-IDF.
keyword_columns = [f'Keyword_{i}' for i in range(1, 21)]
df['keywords'] = [
    ' '.join(str(value) for value in row if pd.notna(value))
    for row in df[keyword_columns].itertuples(index=False, name=None)
]

# Turn the keyword documents into TF-IDF vectors.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['keywords'])

# Density-based clustering on cosine distance: two files are neighbours when
# their distance is at most eps, and a cluster needs at least min_samples
# neighbouring files.
dbscan = DBSCAN(eps=0.5, min_samples=2, metric='cosine')
clusters = dbscan.fit_predict(X)

# Store the cluster label of each file (DBSCAN uses -1 for noise points) and
# save the annotated table to Drive.
df['Cluster'] = clusters
output_file_path = '/content/drive/MyDrive/model/keywords_with_clusters.xlsx'
df.to_excel(output_file_path, index=False)

# Echo the destination path as the cell output.
output_file_path


'/content/drive/MyDrive/model/saved_keywords_with_clusters.xlsx'