In [None]:
# Mount Google Drive at /content/drive (Colab-only: relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources this notebook relies on.
# 'punkt_tab' is requested in addition to 'punkt' because NLTK 3.9+ loads the
# sentence/word tokenizer models from 'punkt_tab'; with only 'punkt' installed,
# sent_tokenize / word_tokenize raise LookupError on those versions.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'wordnet',
    'stopwords',
):
    nltk.download(nltk_resource)

stop_words = set(stopwords.words('english'))  # set of English stop words
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Folder containing the input CSV files (on the mounted Google Drive).
data_dir = '/content/drive/MyDrive/논문/The Great Gatsby/Data_Gatsby/character_speech'
# The generated matrices go into a 'matrix' subfolder of data_dir.
output_dir = os.path.join(data_dir, 'matrix')

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

In [None]:
def create_cooccurrence_matrix(file_path):
    """Build a symmetric, sentence-level word co-occurrence matrix from a CSV file.

    The CSV at ``file_path`` must have a ``content`` column. Every non-missing
    cell of that column is split into sentences, and each sentence is reduced
    to a list of words as follows:

    1. every character that is neither a word character nor whitespace is
       replaced by a space, and digit runs are removed;
    2. the text is lower-cased and tokenized;
    3. each token is lemmatized with the module-level ``lemmatizer``;
    4. lemmas of length <= 2 or present in the module-level ``stop_words``
       are dropped.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Square integer matrix whose index and columns are the alphabetically
        sorted vocabulary. For every pair of token positions within a sentence,
        both ``[w1, w2]`` and ``[w2, w1]`` are incremented by 1, so a word that
        occurs twice in one sentence adds 2 to its own diagonal cell.

    Raises
    ------
    KeyError
        If the CSV has no ``content`` column.
    """
    data = pd.read_csv(file_path)
    # Skip missing cells: str(NaN) is "nan", which would otherwise survive the
    # filters below and show up in the vocabulary as a bogus word.
    content = data['content'].dropna()

    words_per_sentence = []
    for cell in content:
        for sentence in nltk.sent_tokenize(str(cell)):
            # Replace everything except word characters and whitespace with a space.
            cleaned_sentence = re.sub(r'[^\w\s]', ' ', sentence)
            # Remove digits.
            cleaned_sentence = re.sub(r'\d+', '', cleaned_sentence)
            # Lower-case, tokenize, then lemmatize.
            lemmas = (
                lemmatizer.lemmatize(token)
                for token in word_tokenize(cleaned_sentence.lower())
            )
            words_per_sentence.append(
                [lemma for lemma in lemmas
                 if len(lemma) > 2 and lemma not in stop_words]
            )

    # Vocabulary in alphabetical order; it labels both axes of the matrix.
    all_words = sorted(set().union(*words_per_sentence))
    position = {word: idx for idx, word in enumerate(all_words)}

    # Accumulate into a plain integer array: label-based DataFrame.loc updates
    # inside the pair loop are orders of magnitude slower for the same result.
    counts = np.zeros((len(all_words), len(all_words)), dtype=np.int64)
    for words in words_per_sentence:
        ids = [position[word] for word in words]
        for k, i in enumerate(ids):
            for j in ids[k + 1:]:
                counts[i, j] += 1
                counts[j, i] += 1

    return pd.DataFrame(counts, index=all_words, columns=all_words)

def save_matrix_to_csv(matrix, output_file, suffix="_matrix"):
    """Write ``matrix`` to ``<output_file><suffix>.csv`` as UTF-8 CSV.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Table to write; its index is written as the first column.
    output_file : str
        Destination path without the suffix and extension.
    suffix : str, optional
        Text inserted between ``output_file`` and ``.csv``.
    """
    # Assemble the final path: base name, then suffix, then the .csv extension.
    destination = "{}{}.csv".format(output_file, suffix)
    matrix.to_csv(destination, encoding='utf-8')

In [None]:
# Build and save one co-occurrence matrix CSV for every CSV file in data_dir.
# sorted() gives a reproducible processing order (os.listdir order is arbitrary).
for file_name in sorted(f for f in os.listdir(data_dir) if f.endswith('.csv')):
    print(f"Processing {file_name}")
    matrix = create_cooccurrence_matrix(os.path.join(data_dir, file_name))
    # splitext removes only the final extension, so a file name that contains
    # additional dots is not truncated at its first dot.
    output_file = os.path.join(output_dir, os.path.splitext(file_name)[0])
    save_matrix_to_csv(matrix, output_file)

Processing Father_speech.csv
Processing AuntsandUncles_speech.csv
Processing man_speech.csv
Processing Tom_speech.csv
Processing Daisy_speech.csv
Processing Nick_speech.csv
Processing Jordan_speech.csv
Processing MrWilson_speech.csv
Processing MrsWilson_speech.csv
Processing oldman_speech.csv
Processing MrsMckee_speech.csv
Processing MrMckee_speech.csv
Processing Catherine_speech.csv
Processing elevatorboy_speech.csv
Processing twogirls_speech.csv
Processing thegirl_speech.csv
Processing Lucille_speech.csv
Processing othergirl_speech.csv
Processing MrMumble_speech.csv
Processing owlman_speech.csv
Processing Gatsby_speech.csv
Processing orchestraleader_speech.csv
Processing butler_speech.csv
Processing agirl_speech.csv
Processing wife1_speech.csv
Processing wife2_speech.csv
Processing wife3_speech.csv
Processing husband2_speech.csv
Processing husband3_speech.csv
Processing theman_speech.csv
Processing crowd_speech.csv
Processing youngladies_speech.csv
Processing policeman_speech.csv
Pro