In [11]:
"""
M21W0B09 入谷雄介
Data mining 13 class task
"""
import csv
import glob
import itertools
import math
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, preprocessing
from matplotlib import pyplot as plt

stop_word_list = [i[0] for i in csv.reader(open('stop_word.csv', 'r'))]
vectorizer = TfidfVectorizer()


class IdfUtil:
	@staticmethod
	def get_all_documents():
		file_name_list = glob.glob('21Documents/*.txt')
		file_name_list.sort()
		return file_name_list

	@staticmethod
	def read_21documents(file_name) -> str:
		word_list, tmp = [], []
		with open(file_name, 'r') as f:
			text = f.read()
			tmp.append([i for i in text.split(' ') if 0 < len(i)])
		for word in list(itertools.chain(*tmp)):
			if word not in stop_word_list:
				word_list.append(word)
		result_text = ' '.join(word_list)
		return result_text


class WordVector(IdfUtil):
	def __init__(self):
		self.generate_word_vector()

	@classmethod
	def generate_word_vector(cls):
		for file_name in cls.get_all_documents():
			print('=' * 50)
			print(f' File name = {file_name}')
			print('=' * 50)
			text = cls.read_21documents(file_name)
			text_corpus = text.split('\n')
			print(text_corpus)
			X = vectorizer.fit_transform(text_corpus)
			print(vectorizer.get_feature_names())
			print(X.toarray())
			print('\n')


class IDF(IdfUtil):
	def __init__(self):
		self.calc_idf()

	@classmethod
	def calc_idf(cls):
		idf_dict = {}
		document_list = cls.get_all_documents()
		for file_name in document_list:
			text = cls.read_21documents(file_name)
			text_corpus = text.split('\n')
			vectorizer.fit_transform(text_corpus)
			for word in vectorizer.get_feature_names():
				if word not in idf_dict:
					idf_dict[word] = 0
				idf_dict[word] += 1
		for key, value in idf_dict.items():
			idf_dict[key] = round(len(document_list) / value, 2)
		print('=' * 30, 'Inverse Document Frequency', '=' * 30)
		print(idf_dict)


class FreqDamp(IdfUtil):
	def __init__(self):
		self.calc_freq_damp()

	@classmethod
	def calc_freq_damp(cls):
		freq_damp_dict = {}
		document_list = cls.get_all_documents()
		for file_name in document_list:
			text = cls.read_21documents(file_name)
			text_corpus = text.split('\n')
			vectorizer.fit_transform(text_corpus)
			for word in vectorizer.get_feature_names():
				if word not in freq_damp_dict:
					freq_damp_dict[word] = 0
				freq_damp_dict[word] += 1
		for key, value in freq_damp_dict.items():
			freq_damp_dict[key] = round(value / len(document_list), 2)
		print('=' * 30, 'Frequency Damping', '=' * 30)
		print(freq_damp_dict)


class NormalizedFreq(IdfUtil):
	@classmethod
	def calc_normalized_freq(cls, file_name):
		idf_dict = {}
		text = cls.read_21documents(file_name)
		unique_words = text.split(' ')
		text_corpus = text.split('\n')
		N = len(text_corpus)
		for unique_word in unique_words:
			count = 0
			for sen in text_corpus:
				if unique_word in sen.split():
					count = count + 1
				idf_dict[unique_word] = (math.log((1 + N) / (count + 1))) + 1
		return idf_dict


class CosineSimilarity(IdfUtil):
	@classmethod
	def calc_cosine_similarity(cls, doc_pair):
		docs = [cls.read_21documents(doc_pair[0]), cls.read_21documents(doc_pair[1])]
		vectorizer = TfidfVectorizer(max_df=0.9)
		X = vectorizer.fit_transform(docs)
		print('feature_names:', vectorizer.get_feature_names())
		print('X:')
		print(X.toarray())
		sim = cosine_similarity(X)  # 類似度行列の作成
		for from_id in range(len(docs)):
			print('doc_id:', from_id)
			for to_id in range(len(docs)):
				print('\tsim[{0}][{1}] = {2:f}'.format(
					from_id, to_id, sim[from_id][to_id]))


class Jaccard(IdfUtil):
	@classmethod
	def jaccard(cls, doc1, doc2):
		data1 = cls.read_21documents(doc1).split(' ')
		data2 = cls.read_21documents(doc2).split(' ')
		items = 0
		for item in data1:
			if item in data2:
				items += 1
		print('=' * 50)
		print(doc1, doc2)
		print(items / (len(data1) + len(data2) - items))


class Kmeans(IdfUtil):
	def __init__(self, cluster_num):
		self.docs = self.get_all_documents()
		self.doc_num = len(self.docs)
		self.cluster_num = cluster_num
		self.result_df = pd.DataFrame(columns=['vec'])
		self.convert_vector()

	def separate_kmeans_clusters(self, input_docs):
		for doc_name in input_docs:
			X = self.result_df.loc[doc_name][0]
			sc = preprocessing.StandardScaler()
			sc.fit(X)
			X_norm = sc.transform(X)
			cls = KMeans(n_clusters=self.cluster_num)
			result = cls.fit(X_norm)
			print(result.labels_)

	def convert_vector(self):
		for doc in self.get_all_documents():
			text = self.read_21documents(doc)
			text_corpus = text.split('\n')
			vectorizer.fit_transform(text_corpus)
			X = vectorizer.transform(text_corpus)
			self.result_df.loc[doc] = [X.toarray()]


In [12]:
# (1)
WordVector()

 File name = 21Documents/DOC01.txt
['Software engineering systematic engineering approach software development ', 'A software engineer person applies principles software engineering design, develop, maintain, test, evaluate computer software. The term programmer sometimes used synonym, may also lack connotations engineering education skills.', 'Engineering techniques used inform[clarification needed] software development process involves definition, implementation, assessment, measurement, management, change, improvement software life cycle process itself. It heavily uses software configuration management systematically controlling changes configuration, maintaining integrity traceability configuration code throughout system life cycle. Modern processes use software versioning.', '']
['also', 'applies', 'approach', 'assessment', 'change', 'changes', 'clarification', 'code', 'computer', 'configuration', 'connotations', 'controlling', 'cycle', 'definition', 'design', 'develop', 'developm



<__main__.WordVector at 0x147ec2fa0>

In [13]:
# (2)
IDF()

{'also': 2.62, 'applies': 10.5, 'approach': 5.25, 'assessment': 21.0, 'change': 5.25, 'changes': 10.5, 'clarification': 21.0, 'code': 10.5, 'computer': 2.62, 'configuration': 10.5, 'connotations': 21.0, 'controlling': 21.0, 'cycle': 21.0, 'definition': 4.2, 'design': 5.25, 'develop': 3.5, 'development': 1.75, 'education': 10.5, 'engineer': 10.5, 'engineering': 3.0, 'evaluate': 21.0, 'heavily': 21.0, 'implementation': 10.5, 'improvement': 10.5, 'inform': 21.0, 'integrity': 21.0, 'involves': 3.5, 'it': 2.62, 'itself': 21.0, 'lack': 21.0, 'life': 7.0, 'maintain': 21.0, 'maintaining': 21.0, 'management': 7.0, 'may': 7.0, 'measurement': 10.5, 'modern': 7.0, 'needed': 21.0, 'person': 21.0, 'principles': 5.25, 'process': 4.2, 'processes': 2.33, 'programmer': 21.0, 'skills': 10.5, 'software': 3.0, 'sometimes': 10.5, 'synonym': 21.0, 'system': 7.0, 'systematic': 4.2, 'systematically': 21.0, 'techniques': 7.0, 'term': 4.2, 'test': 21.0, 'the': 1.62, 'throughout': 7.0, 'traceability': 21.0, 'use'

<__main__.IDF at 0x147ec2970>

In [14]:
# (3)
FreqDamp()

{'also': 0.38, 'applies': 0.1, 'approach': 0.19, 'assessment': 0.05, 'change': 0.19, 'changes': 0.1, 'clarification': 0.05, 'code': 0.1, 'computer': 0.38, 'configuration': 0.1, 'connotations': 0.05, 'controlling': 0.05, 'cycle': 0.05, 'definition': 0.24, 'design': 0.19, 'develop': 0.29, 'development': 0.57, 'education': 0.1, 'engineer': 0.1, 'engineering': 0.33, 'evaluate': 0.05, 'heavily': 0.05, 'implementation': 0.1, 'improvement': 0.1, 'inform': 0.05, 'integrity': 0.05, 'involves': 0.29, 'it': 0.38, 'itself': 0.05, 'lack': 0.05, 'life': 0.14, 'maintain': 0.05, 'maintaining': 0.05, 'management': 0.14, 'may': 0.14, 'measurement': 0.1, 'modern': 0.14, 'needed': 0.05, 'person': 0.05, 'principles': 0.19, 'process': 0.24, 'processes': 0.43, 'programmer': 0.05, 'skills': 0.1, 'software': 0.33, 'sometimes': 0.1, 'synonym': 0.05, 'system': 0.14, 'systematic': 0.24, 'systematically': 0.05, 'techniques': 0.14, 'term': 0.24, 'test': 0.05, 'the': 0.62, 'throughout': 0.14, 'traceability': 0.05, '

<__main__.FreqDamp at 0x147eeb040>

In [15]:
target_doc1 = ('21Documents/DOC01.txt', '21Documents/DOC02.txt')
target_doc2 = ('21Documents/DOC01.txt', '21Documents/DOC15.txt')
target_doc3 = ('21Documents/DOC20.txt', '21Documents/DOC21.txt')


In [16]:
# (4)
for target_doc in [target_doc1, target_doc2, target_doc3]:
    CosineSimilarity.calc_cosine_similarity(target_doc1)

feature_names: ['almost', 'also', 'always', 'application', 'applies', 'as', 'assessment', 'back', 'best', 'branch', 'can', 'carefully', 'changes', 'changing', 'clarification', 'code', 'complete', 'computer', 'configuration', 'connotations', 'controlling', 'cycle', 'deals', 'defines', 'design', 'develop', 'disciplined', 'does', 'doesn', 'education', 'effective', 'efficient', 'engineer', 'engineered', 'evaluate', 'everything', 'explain', 'fully', 'go', 'heavily', 'honed', 'ieee', 'implementation', 'improvement', 'inform', 'integrity', 'involves', 'is', 'itself', 'lack', 'lead', 'leads', 'life', 'machines', 'maintain', 'maintaining', 'maintenance', 'management', 'may', 'meaning', 'measurement', 'methods', 'modern', 'needed', 'needs', 'operates', 'operation', 'person', 'practices', 'process', 'processes', 'product', 'production', 'products', 'programmer', 'quantifiable', 'real', 'reliable', 'set', 'situations', 'skills', 'so', 'sometimes', 'stage', 'synonym', 'system', 'systematically', 't

In [17]:
# (5)
for target_doc in [target_doc1, target_doc2, target_doc3]:
    Jaccard.jaccard(target_doc[0], target_doc[1])

21Documents/DOC01.txt 21Documents/DOC02.txt
0.11764705882352941
21Documents/DOC01.txt 21Documents/DOC15.txt
0.033783783783783786
21Documents/DOC20.txt 21Documents/DOC21.txt
0.009009009009009009


In [18]:
# (6)
KmeansObj = Kmeans(3)
KmeansObj.separate_kmeans_clusters(IdfUtil.get_all_documents())

[0 2 1 0]
[0 2 0 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 0 2 0 0 0]


ValueError: n_samples=1 should be >= n_clusters=3.