In [16]:
! pip install gensim nltk



In [18]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# nltk 데이터 다운로드 (최초 실행 시 필요)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# 전처리 함수
def preprocess_text(text):
    # HTML 태그 제거
    text = re.sub(r'<.*?>', ' ', text)
    # 특수 문자 및 구두점 제거
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # 대소문자 변환
    text = text.lower()
    # 토큰화
    words = word_tokenize(text)
    # 불용어 제거
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # 어간 추출 및 표제어 추출
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

# 입력 및 출력 디렉토리
input_dir = "C:\\Users\\jrnee\\Desktop\\whitepaper\\txt"
output_dir = "C:\\Users\\jrnee\\Desktop\\whitepaper\\txt_token"

# 출력 디렉토리가 존재하지 않으면 생성
os.makedirs(output_dir, exist_ok=True)

# 디렉토리 내의 모든 텍스트 파일을 읽어와 전처리 후 파일로 저장
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        input_filepath = os.path.join(input_dir, filename)
        output_filepath = os.path.join(output_dir, filename)
        
        with open(input_filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            tokens = preprocess_text(text)
            # 전처리된 텍스트를 문자열로 변환
            processed_text = ' '.join(tokens)
        
        with open(output_filepath, 'w', encoding='utf-8') as output_file:
            output_file.write(processed_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jrnee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jrnee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jrnee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
! pip install scipy==1.10.1

Collecting scipy==1.10.1
  Downloading scipy-1.10.1-cp311-cp311-win_amd64.whl.metadata (58 kB)
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/59.0 kB ? eta -:--:--
     -------------------------------- ----- 51.2/59.0 kB 525.1 kB/s eta 0:00:01
     -------------------------------------- 59.0/59.0 kB 518.5 kB/s eta 0:00:00
Downloading scipy-1.10.1-cp311-cp311-win_amd64.whl (42.2 MB)
   ---------------------------------------- 0.0/42.2 MB ? eta -:--:--
   ---------------------------------------- 0.2/42.2 MB 3.6 MB/s eta 0:00:12
   ---------------------------------------- 0.3/42.2 MB 3.5 MB/s eta 0:00:12
   ---------------------------------------- 0.5/42.2 MB 3.7 MB/s eta 0:00:12
    --------------------------------------- 0.6/42.2 MB 3.5 MB/s eta 0:00:12
    --------------------------------------- 0.8/42.2 MB 3.6 MB/s eta 0:00:12
    --------------------------------------- 1.0/42.2 MB 3.6 MB/s eta 0:00:12
   - --

  You can safely remove it manually.
  You can safely remove it manually.


In [2]:
#1번
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk

# nltk 데이터 다운로드 (최초 실행 시 필요)
nltk.download('punkt')

# 파일들이 저장된 디렉토리 경로
input_dir = "C:\\Users\\jrnee\\Desktop\\whitepaper\\txt_token"
output_dir = "C:\\Users\\jrnee\\Desktop\\whitepaper\\txt_vectors"

# 텍스트 전처리 함수
def preprocess_text(text):
    return word_tokenize(text.lower())

# 디렉토리 내의 모든 텍스트 파일을 읽어와 TaggedDocument로 변환
documents = []
for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            tokens = preprocess_text(text)
            documents.append(TaggedDocument(words=tokens, tags=[filename]))

# Doc2Vec 모델 학습
model = Doc2Vec(vector_size=100, min_count=1, epochs=40)  # 기본 설정 사용
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

# 벡터화 및 저장
os.makedirs(output_dir, exist_ok=True)

for document in documents:
    vector = model.infer_vector(document.words)
    vector_filename = os.path.join(output_dir, f"{document.tags[0]}.vec")
    with open(vector_filename, 'w', encoding='utf-8') as vector_file:
        vector_file.write(" ".join(map(str, vector)))

print("벡터화 및 저장이 완료되었습니다.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jrnee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


벡터화 및 저장이 완료되었습니다.


In [1]:
#dbscan해보자구
import os
import numpy as np
from sklearn.cluster import DBSCAN
import pandas as pd

# Directory containing the vector files
vector_dir = "C:\\Users\\jrnee\\Desktop\\whitepaper\\txt_vectors"

# Load vectors from files
vectors = []
filenames = []
for filename in os.listdir(vector_dir):
    if filename.endswith(".vec"):
        filepath = os.path.join(vector_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            vector = list(map(float, file.read().split()))
            vectors.append(vector)
            filenames.append(filename)

# Convert to numpy array
vectors = np.array(vectors)

# Apply DBSCAN
dbscan = DBSCAN(eps=25.0, min_samples=2)  # Adjust eps and min_samples as needed
clusters = dbscan.fit_predict(vectors)

# Create a DataFrame with filenames and their cluster assignments
results = pd.DataFrame({
    'filename': filenames,
    'cluster': clusters
})

# Save the results to a CSV file
output_path = "C:\\Users\\jrnee\\Desktop\\whitepaper\\dbscan_results2.csv"
results.to_csv(output_path, index=False)

# Display results
print("DBSCAN clustering results saved to", output_path)
print(results.head())


DBSCAN clustering results saved to C:\Users\jrnee\Desktop\whitepaper\dbscan_results2.csv
                                            filename  cluster
0         0x ZRX whitepapers - whitepaper.io.txt.vec        0
1    1inch 1INCH whitepapers - whitepaper.io.txt.vec        0
2  1irstcoin FST whitepapers - whitepaper.io.txt.vec        0
3     1World 1WO whitepapers - whitepaper.io.txt.vec        0
4     42-coin 42 whitepapers - whitepaper.io.txt.vec        0


In [3]:
# Analyze clusters
total_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
noise_points = list(clusters).count(-1)

print(f"Total clusters (excluding noise): {total_clusters}")
print(f"Number of noise points: {noise_points}")

Total clusters (excluding noise): 16
Number of noise points: 267
