In [57]:
import spacy
import json
import re
from typing import List
from spacy.tokenizer import Tokenizer

In [2]:
def read_json(file: str):
    with open(file) as f:
        data = json.load(f)
    
    return data

In [8]:
def read_file(file: str):
    lines = []
    with open(file) as file_in:        
        for line in file_in:
            lines.append(line)
            
    return lines

# Заголовки новин

## 1. Форматування

### 1. напишіть програму, яка форматує заголовки за вказаними правилами

1) З великої літери потрібно писати слова довжиною 4 чи більше літер. <br/>
2) З великої літери потрібно писати перше і останнє слово заголовку, незалежно від частини мови. <br/>
3) З великої літери потрібно писати іменники, займенники, дієслова, прикметники, прислівники та підрядні сполучники. <br/>
4) Якщо слово написане через дефіс, велику літеру потрібно додати для кожної частинки слова (наприклад, правильно "Self-Reflection", а не "Self-reflection"). <br/>
5) З маленької літери потрібно писати всі інші частини мови: артиклі/визначники, сурядні сполучники, прийменники, частки, вигуки. <br/>

In [230]:
class HeadlineFormatter:
    
    TITLE_POS_TAGS = ['NOUN', 'PRON', 'PROPN', 'VERB', 'ADJ', 'ADV', 'SCONJ', 'AUX']
    LOWER_POS_TAGS = ['DET', 'CCONJ', 'PREP', 'PART', 'INTJ']
    
    LOWER_EXCEPTION = ["n't", "'s"]
    
    def __init__(self):
        nlp = spacy.load("en_core_web_sm")
        self.__tokenizer = spacy.load("en_core_web_sm")
        self.__upper_pattern = re.compile("[A-Z]+")

    
    def format(self, headline: str) -> str:
        tokens = self.__tokenize(headline)
        size = len(tokens)
        
        text_with_ws_list = []

        prev_title = False
        is_prev_punct = False
        
        start_quotes = False
    
        for i in range(0, size):
            token = tokens[i]
            text_with_ws = token.text_with_ws
            
            
#             if token.pos_ == 'PUNCT' and token.text == "'":
#                 if start_quotes:
#                     start_quotes = False
#                 else:
#                     start_quotes = True
            
            if start_quotes is True:
                text_with_ws_list.append(text_with_ws)
                continue

            if token.pos_ == 'PUNCT':
                is_prev_punct = True
                text_with_ws_list.append(text_with_ws)
                continue
                
            if len(token.text) > 1 and self.__is_upper(token.text):
                text_with_ws_list.append(text_with_ws)
                continue


            
            if i == 0 or i == size -1 or self.__should_be_title(token):
                text_with_ws = self.__title_token(token)
                prev_title = True
            else:
                if prev_title and is_prev_punct:
                    text_with_ws = self.__title_token(token)
                    prev_title = True
                else:
                    prev_title = False
                    
                    if self.__should_be_lower(token):
                        text_with_ws = text_with_ws.lower()
                    

            text_with_ws_list.append(text_with_ws)
            is_prev_punct = False
                    
        return self.__untokenize(text_with_ws_list)
    
    def __should_be_title(self, token):
        if self.__is_article(token.text) is True:
            return False
        
        if token.text in self.LOWER_EXCEPTION:
            return False
        
        if token.pos_ in self.TITLE_POS_TAGS:
            return True
        
        if len(token.text) >= 4:
            return True
        
        return False
    
    def __should_be_lower(self, token):
        if self.__is_article(token.text) is True:
            return True
        
        if token.pos_ in self.LOWER_POS_TAGS:
            return True
        
        return False


    def __is_upper(self, text):
        if self.__upper_pattern.fullmatch(text) is not None:
            return True
        return False
    
    def __is_article(self, text: str):
        if text.lower() in ['a', 'an', 'the']:
            return True
        
        return False


        
                
    def __title_token(self, token)-> str:
        return token.text_with_ws.title()
    
    def __title(self, token):
        #TODO: consider "-
        return token.text_with_ws.title()
    
    
    def __tokenize(self, text):
        return self.__tokenizer(text)
    
    def __untokenize(self, text_with_ws_list):
        return ''.join(text_with_ws for text_with_ws in text_with_ws_list)
        

In [231]:
headline_formatter = HeadlineFormatter()

In [232]:
headline_formatter.format("Dicks Creek: Georgia's Go-to Trout Water")

"Dicks Creek: Georgia's Go-To Trout Water"

In [233]:
headline_formatter.format("Friday Fun: Project Runway's Kayne at SWS, Manuel dances for charity, Laura Bell Bundy sings at PLAY")

"Friday Fun: Project Runway's Kayne at SWS, Manuel Dances for Charity, Laura Bell Bundy Sings at PLAY"

### 2. перевірте якість роботи програми на валідаційній вибірці

In [260]:
def calculate_accuracy(expected_headlines: List[str], actual_headlines: List[str], headlines):
    
    assert len(expected_headlines) == len(actual_headlines)
    tp = 0
    
    for i in range(0, len(expected_headlines)):
        if expected_headlines[i] == actual_headlines[i]:
            tp += 1
    accuracy = tp / len(expected_headlines)
    
    return accuracy

In [261]:
val_file = "../../../tasks/02-structural-linguistics/data/headlines-test-set.json"

In [262]:
val_data = read_json(val_file)

In [263]:
expected_headlines = [item[1] for item in val_data]

In [264]:
headlines = [item[0] for item in val_data]

In [265]:
headline_formatter = HeadlineFormatter()

In [266]:
formatted_headlines = [headline_formatter.format(headline) for headline in headlines]

In [267]:
accuracy = calculate_accuracy(expected_headlines=expected_headlines, 
                              actual_headlines=formatted_headlines, 
                              headlines=headlines)

#### Accuracy

In [268]:
print("Accuracy: ", accuracy)

Accuracy:  0.69


### 3. проженіть вашу програму на корпусі заголовків з The Examiner і вирахуйте, скільки заголовків там відформатовано за правилами (скільки заголовків залишились незмінними після форматування)

In [255]:
examiner_file = "../../../tasks/02-structural-linguistics/data/examiner-headlines.txt"

In [256]:
examiner_lines = read_file(examiner_file)

In [258]:
headline_formatter = HeadlineFormatter()

In [259]:
num_changed = 0
for headline in examiner_lines:
    headline_formatted = headline_formatter.format(headline)
    if headline_formatted != headline:
        num_changed += 1
        
size = len(examiner_lines)
print(f"There were formatted {num_changed} out of {size}")
print(f"There were not  formatted {size - num_changed} out of {size}")

There were formatted 4412 out of 5000
There were not  formatted 588 out of 5000


## 2. Вірусні новини

In [123]:
import spacy
import nltk
from spacy import displacy
from collections import Counter
import en_core_web_sm
from nltk.corpus import sentiwordnet as swn

1. Напишіть програму, яка аналізує заголовок за першими трьома ознаками (у спрощеній формі) <br/>
1.1 Чи є в заголовку іменовані стуності? <br/>
1.2 Чи є заголовок позитивно чи негативно забарвлений? <br/>
1.3 Чи є в заголовку прикметники та прислівники вищого і найвищого ступенів порівняння? <br/>

In [125]:
class NERIdentifier:
    def __init__(self):
        self.nlp = en_core_web_sm.load()
    
    def contains_ner(self, text: str, person=False, org=False):
        doc = self.nlp(text)
        
        for ner_item in doc.ents:
            if person and ner_item.label_ == 'PERSON':
                return True
            
            if org and ner_item.label_ == 'ORG':
                return True
            
        return False

In [126]:
class SentimentScoreCalculator:

    MAX_COUNT_IN_SENTENCE = 5

    TAG_SENT_DICT = {
        'NOUN' : 'n',
        'VERB' : 'v',
        'ADJ' : 'a',
        'ADV' : 'r'
    }

    def __init__(self):
        self.wnl = nltk.WordNetLemmatizer()
        self.nlp = en_core_web_sm.load()

    def compute_pos_neg_score(self, headline: str):
        full_pos_score, full_neg_score = [], []
        count = 0

        doc = self.nlp(headline)
        for item in doc:
            sent_tag = self.get_tag_for_sentiment(item.pos_)
            if sent_tag is None:
                continue

            word = item.text
            lemmatizedsent = self.wnl.lemmatize(word)
            synsets = list(swn.senti_synsets(lemmatizedsent, sent_tag))

            pos_score, neg_score = 0, 0
            if len(synsets) > 0:
                for syn in synsets:
                    pos_score += syn.pos_score()
                    neg_score += syn.neg_score()

                if pos_score == 0 and neg_score == 0:
                    continue

                full_pos_score.append(pos_score / len(synsets))
                full_neg_score.append(neg_score / len(synsets))

                count += 1

            if count >= self.MAX_COUNT_IN_SENTENCE:
                break
        if count == 0:
            return 0, 0

        return sum(full_pos_score) / count, sum(full_neg_score) / count

    def get_tag_for_sentiment(self, pos_tag: str):
        if pos_tag not in self.TAG_SENT_DICT:
            return None

        return self.TAG_SENT_DICT[pos_tag]


In [127]:
class HeadlineAnalyzer:
    
    
    SENT_THRESHOLD = 0.5
    
    COMP_SUPER_LABELS = ['JJR', 'JJS', 'RBR', 'RBS']
    
    def __init__(self):
        self.__ner_identifier = NERIdentifier()  
        self.__sentiment_score_calculator = SentimentScoreCalculator()
    
    def match(self, headline: str):
        
        if self.contains_ner(headline) is False:
            return False
        
        if self.contains_enough_pos_or_neg_sentiments(headline) is False:
            return False
        
        if self.contains_comparative_or_superlative(headline) is False:
            return False
        
        
        return True
    
    def contains_ner(self, headline: str):
        return self.__ner_identifier.contains_ner(headline, person=True, org=True)
    
    
    
    def contains_comparative_or_superlative(self, headline: str):
        text = nltk.word_tokenize(headline)
        tags = nltk.pos_tag(text)
        
        for tag in tags:
            label = tag[1]
            if label in self.COMP_SUPER_LABELS:
                return True
            
        return False
    

    def contains_enough_pos_or_neg_sentiments(self, headline: str):
        pos_score, neg_score = self.__sentiment_score_calculator.compute_pos_neg_score(headline)
        
        if pos_score >= self.SENT_THRESHOLD:
            return True
        
        if neg_score >= self.SENT_THRESHOLD:
            return True
        
        return False

In [128]:
corpus_file = "../../../tasks/02-structural-linguistics/data/examiner-headlines.txt"

In [129]:
corpus = read_file(corpus_file)

In [130]:
headline_analyzer = HeadlineAnalyzer()

#### headlines with NER

In [98]:
ner_headlines = [headline for headline in corpus if headline_analyzer.contains_ner(headline)]

In [102]:
ner_headlines[:5]

['Halep enters Rogers Cup final in straight sets win over Errani\n',
 "Talladega turmoil could spell trouble for NASCAR's Chase field\n",
 '2011-2012 NHL team preview: Detroit Red Wings\n',
 'Cal coach Jeff Tedford taking a different approach in 2010 -- Part 1\n',
 "SF Beer Week 2013: what's for dinner (part 2)\n"]

In [100]:
ner_headlines_percent = 100 * len(ner_headlines) / len(corpus)

In [101]:
print(f"There are {ner_headlines_percent} % headlines with Named-entity recognition")

There are 50.26 % headlines with Named-entity recognition


#### headlines with positve or negative sentiments

In [131]:
pos_neg_headlines = [headline for headline in corpus if headline_analyzer.contains_enough_pos_or_neg_sentiments(headline)]

In [137]:
pos_neg_headlines[-5:]

["Britney Spears Hair: Yesterday's bad hair stem back from her shaving it all off [VIDEO]\n",
 'The worst marketing campaigns of 2011\n',
 '20 photos of retired Military Working Dog Gunnery Sgt. Lucca K458\n',
 'Public League Coaches above it all again with wrong message.\n',
 'Support Richmond caregivers and their loved ones on Pancreatic Cancer Advocacy Day\n']

In [133]:
pos_neg_headlines_percent = 100 * len(pos_neg_headlines) / len(corpus)

In [134]:
print(f"There are {pos_neg_headlines_percent} % headlines with Named-entity recognition")

There are 1.62 % headlines with Named-entity recognition


#### headlines with comparative and superlative adjectives and adverbs

In [107]:
comparative_superlative_headlines = [headline for headline in corpus if headline_analyzer.contains_comparative_or_superlative(headline)]

In [109]:
comparative_superlative_headlines[:5]

["Jersey Shore Season 6 cast's salaries revealed; More than President Obama!\n",
 "Sweeter than 'The Hummingbird and the Honey Bee'\n",
 'Study finds that young Americans are Democrats, want government to do more\n',
 'Ooh la la! What a most sensational woman Josephine Baker was, and is!\n',
 'Best 2014 Black Friday TV deals online: Amazon, Best Buy, Walmart, Target\n']

In [110]:
comp_super_headlines_per = 100 * len(comparative_superlative_headlines) / len(corpus)

In [111]:
print(f"There are {comp_super_headlines_per} % headlines with comparative or superlative adjectives or adverbs")

There are 4.38 % headlines with comparative or superlative adjectives or adverbs


#### Headlines with NER and positive or negative sentiments and comparative or superlative adjectives and adverbs

In [138]:
interested_headlines = [headline for headline in corpus if headline_analyzer.match(headline)]

In [139]:
interested_headlines

['Best 2014 Black Friday TV deals online: Amazon, Best Buy, Walmart, Target\n',
 "World's most expensive gingerbread house\n",
 'Is Glenn Mills the best sprint coach in the world? (Video)\n',
 "George Eliot is 'A Most Dangerous Woman' at Shakespeare Theatre of New Jersey\n",
 'Weekly sustainable seafood from H&H Fresh Fish: Santa Cruz, Campbell, and more\n',
 "UCLA's kidney transplant survival rate is best in the U.S.\n"]

In [140]:
percent = 100 * len(interested_headlines) / len(corpus)

In [141]:
print(f"There are {percent} % interesting headlines")

There are 0.12 % interesting headlines
