In [1]:
import spacy
import json
from typing import List
from spacy.tokenizer import Tokenizer

In [2]:
def read_json(file: str):
    with open(file) as f:
        data = json.load(f)
    
    return data

### 1. напишіть програму, яка форматує заголовки за вказаними правилами

1) З великої літери потрібно писати слова довжиною 4 чи більше літер. <br/>
2) З великої літери потрібно писати перше і останнє слово заголовку, незалежно від частини мови. <br/>
3) З великої літери потрібно писати іменники, займенники, дієслова, прикметники, прислівники та підрядні сполучники. <br/>
4) Якщо слово написане через дефіс, велику літеру потрібно додати для кожної частинки слова (наприклад, правильно "Self-Reflection", а не "Self-reflection"). <br/>
5) З маленької літери потрібно писати всі інші частини мови: артиклі/визначники, сурядні сполучники, прийменники, частки, вигуки. <br/>

In [264]:
class HeadlineFormatter:
    
    TITLE_POS_TAGS = ['NOUN', 'PRON', 'PROPN', 'VERB', 'ADJ', 'ADV', 'SCONJ', 'AUX']
    LOWER_POS_TAGS = ['DET', 'CCONJ', 'PREP', 'PART', 'INTJ']
    
    def __init__(self):
        nlp = spacy.load("en_core_web_sm")
#         self.__tokenizer = nlp.Defaults.create_tokenizer(nlp)
        self.__tokenizer = spacy.load("en_core_web_sm")

    
    def format(self, headline: str) -> str:
        tokens = self.__tokenize(headline)
        size = len(tokens)
        
        text_with_ws_list = [self.__title(tokens[0])]
#         text_with_ws_list = []

        
#         for token in tokens:
        prev_title = False
        is_prev_punct = False
    
        for i in range(1, size - 1):
            token = tokens[i]
            text_with_ws = token.text_with_ws

            if token.pos_ == 'PUNCT':
                is_prev_punct = True
                text_with_ws_list.append(text_with_ws)
                continue
        
            
            if self.__should_be_title(token):
                text_with_ws = self.__title_token(token)
                prev_title = True
            else:
                if prev_title and is_prev_punct:
                    print('We are here')
                    text_with_ws = self.__title_token(token)
                    prev_title = True
                else:
                    prev_title = False
                    
                    if token.pos_ in self.LOWER_POS_TAGS:
                        text_with_ws = text_with_ws.lower()
                    

            text_with_ws_list.append(text_with_ws)
            is_prev_punct = False
        
        text_with_ws_list.append(self.__title(tokens[-1]))
            
        return self.__untokenize(text_with_ws_list)
    
    def __should_be_title(self, token):
        if token.pos_ in self.TITLE_POS_TAGS:
            return True
        
        if len(token.text) >= 4:
            return True
        
        return False
                
    def __title_token(self, token)-> str:
        return token.text_with_ws.title()
    
    def __title(self, token):
        #TODO: consider "-
        return token.text_with_ws.title()
    
    
    def __tokenize(self, text):
        return self.__tokenizer(text)
    
    def __untokenize(self, text_with_ws_list):
        return ''.join(text_with_ws for text_with_ws in text_with_ws_list)
        

In [265]:
headline_formatter = HeadlineFormatter()

In [266]:
headline_formatter.format("Dicks Creek: Georgia's Go-to Trout Water")

We are here


"Dicks Creek: Georgia's Go-To Trout Water"

### 2. перевірте якість роботи програми на валідаційній вибірці

In [267]:
def calculate_accuracy(expected_headlines: List[str], actual_headlines: List[str], headlines):
    
    assert len(expected_headlines) == len(actual_headlines)
    tp = 0
    
    for i in range(0, len(expected_headlines)):
        if expected_headlines[i] == actual_headlines[i]:
            tp += 1
        else:
            print("#####################################################")
            print(headlines[i])
            print(expected_headlines[i])
            print(actual_headlines[i])
            print("#####################################################")
    accuracy = tp / len(expected_headlines)
    
    return accuracy

In [268]:
val_file = "../../../tasks/02-structural-linguistics/data/headlines-test-set.json"

In [269]:
val_data = read_json(val_file)

In [270]:
expected_headlines = [item[1] for item in val_data]

In [271]:
headlines = [item[0] for item in val_data]

In [272]:
expected_headlines[0]

'How to Design a College Curriculum to Help You in Life'

In [273]:
headlines[0]

'How To Design A College Curriculum to Help You in Life'

In [274]:
headline_formatter = HeadlineFormatter()

In [275]:
formatted_headlines = [headline_formatter.format(headline) for headline in headlines]

We are here
We are here
We are here
We are here
We are here
We are here
We are here


In [276]:
accuracy = calculate_accuracy(expected_headlines=expected_headlines, 
                              actual_headlines=formatted_headlines, 
                              headlines=headlines)

#####################################################
Teresa Giudice broke: 'RHONJ' star can't even put gas in her car
Teresa Giudice Broke: 'RHONJ' Star Can't Even Put Gas in Her Car
Teresa Giudice Broke: 'Rhonj' Star Can't Even Put Gas in her Car
#####################################################
#####################################################
Class Warfare: As American as apple pie
Class Warfare: As American as Apple Pie
Class Warfare: As American As Apple Pie
#####################################################
#####################################################
Pat Robertson's controversial remarks about Haiti not shared by most Nashville Christians
Pat Robertson's Controversial Remarks About Haiti Not Shared by Most Nashville Christians
Pat Robertson's Controversial Remarks About Haiti not Shared by Most Nashville Christians
#####################################################
#####################################################
Sarah Geronimo and Gerald Anderson's 

#### Accuracy

In [277]:
print("Accuracy: ", accuracy)

Accuracy:  0.5
