In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Configure

In [3]:
TXT_TYPE = 'invention_title' # invention_title, abstract, claims
CLS_TYPE = 'Mno' # LLno, Lno, Mno, Sno, SSno

# 데이터 불러오기

In [4]:
!cp /content/drive/MyDrive/train.csv /content/train.csv
!cp /content/drive/MyDrive/val.csv /content/val.csv

In [5]:
train_raw = pd.read_csv('/content/train.csv')
test_raw = pd.read_csv('/content/val.csv')

In [6]:
train = train_raw.sample(n=50000, random_state=999) 
test = test_raw.sample(n=5000, random_state=999) 

In [7]:
train = train[[TXT_TYPE, CLS_TYPE]]
test = test[[TXT_TYPE, CLS_TYPE]]

train.rename(columns={TXT_TYPE: 'text'}, inplace=True)
train.rename(columns={CLS_TYPE: 'label'}, inplace=True)
test.rename(columns={TXT_TYPE: 'text'}, inplace=True)
test.rename(columns={CLS_TYPE: 'label'}, inplace=True)

In [8]:
train.head()

Unnamed: 0,text,label
94919,퓨즈 스위치의 작동상태 식별 장치,281
135846,충격조절모듈을 구비한 원형 톱 장착 절삭 장치,259
163772,게르마늄이 함유된 곤달비를 이용한 소주의 제조방법,107
139748,컷트용 미용가위,259
59955,양변기 물탱크의 저장수면의 높이를 조절할 수 있는 오버플로우관을 이용한 겨울철 상수...,360


# Models

In [9]:
import re

class Tokenizer(object):
    def __init__(self, stop_words = [], signs_to_remove = ["?!#%&"]):
        self.stop_words = stop_words
        self.signs_to_remove = signs_to_remove

    def tokenize(self,text):
        return text.lower().split(' ')

    def remove_stop_words(self,token):
        if token in self.stop_words:
            return "stop_word"
        else:
            return token

    def remove_punctuation(self,token):
        return re.sub(str(self.signs_to_remove),"",token)

In [10]:
class NotSeen(Exception):
    """
    Exception for tokens which are not indexed 
    because never seen in the trainin data
    """
    def __init__(self, value):
        self.value = value
    
    def __str__(self):
        return "Token '{}' is never seen in the training set.".format(self.value)

In [11]:
import operator
from functools import reduce

class Classifier(object):
    """docstring for Classifier"""
    def __init__(self, trainedData, tokenizer):
        super(Classifier, self).__init__()
        self.data = trainedData
        self.tokenizer = tokenizer
        self.defaultProb = 0.000000001

    # ali ata bak
    def classify(self, text):
        
        documentCount = self.data.getDocCount()
        classes = self.data.getClasses()

        # only unique tokens
        tokens = list(set(self.tokenizer.tokenize(text)))
        
        probsOfClasses = {}

        for className in classes:
            
            # we are calculating the probablity of seeing each token 
            # in the text of this class
            # P(Token_1|Class_i)
            tokensProbs = [self.getTokenProb(token, className) for token in tokens]
            
            # calculating the probablity of seeing the the set of tokens
            # in the text of this class
            # P(Token_1|Class_i) * P(Token_2|Class_i) * ... * P(Token_n|Class_i)
            try:
                tokenSetProb = reduce(lambda a,b: a*b, (i for i in tokensProbs if i) ) 
            except:
                tokenSetProb = 0
            
            probsOfClasses[className] = tokenSetProb * self.getPrior(className)
        
        return sorted(probsOfClasses.items(), 
            key=operator.itemgetter(1), 
            reverse=True)


    def getPrior(self, className):
        return self.data.getClassDocCount(className) /  self.data.getDocCount()

    def getTokenProb(self, token, className):
        #p(token|Class_i)
        classDocumentCount = self.data.getClassDocCount(className)

        # if the token is not seen in the training set, so not indexed,
        # then we return None not to include it into calculations.
        try:
            tokenFrequency = self.data.getFrequency(token, className)
        except NotSeen as e:
            return None

        # this means the token is not seen in this class but others.
        if tokenFrequency is None:
            return self.defaultProb

        probablity =  tokenFrequency / classDocumentCount
        return probablity

In [12]:
class Trainer(object):
    def __init__(self, tokenizer):
        super(Trainer, self).__init__()
        self.tokenizer = tokenizer
        self.data = TrainedData()

    def train(self, text, className):
        """
        enhances trained data using the given text and class
        """
        self.data.increaseClass(className)

        tokens = self.tokenizer.tokenize(text)
        for token in tokens:
            token = self.tokenizer.remove_stop_words(token)
            token = self.tokenizer.remove_punctuation(token)
            self.data.increaseToken(token, className)

In [13]:
import sys

class TrainedData(object):
    def __init__(self):
        self.docCountOfClasses = {}
        self.frequencies = {}

    def increaseClass(self, className, byAmount = 1):
        self.docCountOfClasses[className] = self.docCountOfClasses.get(className, 0) + 1

    def increaseToken(self, token, className, byAmount = 1):
        if not token in self.frequencies:
                self.frequencies[token] = {}

        self.frequencies[token][className] = self.frequencies[token].get(className, 0) + 1

    def decreaseToken(self, token, className, byAmount=1):
        if token not in self.frequencies:
            raise NotSeen(token)
        foundToken = self.frequencies[token]
        if className not in self.frequencies:
            sys.stderr.write("Warning: token %s has no entry for class %s. Not decreasing.\n" % (token, className))
            return
        if foundToken[className] < byAmount:
            raise ArithmeticError("Could not decrease %s/%s count (%i) by %i, "
                                  "as that would result in a negative number." % (
                                      token, className, foundToken[className], byAmount))
        foundToken[className] -= byAmount

    def getDocCount(self):
        """
        returns all documents count
        """
        return sum(self.docCountOfClasses.values())

    def getClasses(self):
        """
        returns the names of the available classes as list
        """
        return self.docCountOfClasses.keys()

    def getClassDocCount(self, className):
        """
        returns document count of the class. 
        If class is not available, it returns None
        """
        return self.docCountOfClasses.get(className, None)

    def getFrequency(self, token, className):
        if token in self.frequencies:
            foundToken = self.frequencies[token]
            return foundToken.get(className)
        else:
            raise NotSeen(token)

# Train

In [14]:
tokenizer = Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"])

In [15]:
newsTrainer = Trainer(tokenizer)

In [16]:
for row in train.itertuples():
    newsTrainer.train(row.text, row.label)

# Test

In [17]:
newsClassifier = Classifier(newsTrainer.data, tokenizer)

In [18]:
total_cnt = 0
top1_cnt = 0
top5_cnt = 0

for row in test.itertuples():
    classification = newsClassifier.classify(row.text)
    ans = int(row.label)
    if int(classification[0][0]) == ans:
        top1_cnt += 1
        top5_cnt += 1
    else:
        for cls, _ in classification[1:5]:
            if int(cls) == ans:
                top5_cnt += 1
                break

    total_cnt += 1

print('TOP_1: ', top1_cnt / total_cnt)
print('TOP_5: ', top5_cnt / total_cnt)

TOP_1:  0.4544
TOP_5:  0.7446


## For Lno

In [None]:
total_cnt = 0
top1_cnt = 0
top5_cnt = 0

for row in test.itertuples():
    classification = newsClassifier.classify(row.text)
    ans = row.label
    if classification[0][0] == ans:
        top1_cnt += 1
        top5_cnt += 1
    else:
        for cls, _ in classification[1:5]:
            if cls == ans:
                top5_cnt += 1
                break

    total_cnt += 1

print('TOP_1: ', top1_cnt / total_cnt)
print('TOP_5: ', top5_cnt / total_cnt)