# Natural Language Processing. Lab 1
Professor: Vladimir Ivanov

Teaching Assistant: Aidar Valeev


## Task 1
Write a python program that does the following :
1. retrieve content of a wikipedia page on a topic of your choice - english 
2. retrieve content of a wikipedia page on a topic of your choice - language of your choice (i.e russian, french)
3. preprocess the data 
4. print distinct words statatistics, number of chapters, number of sentences, numerical values, number of entities with links

In [75]:
import wikipedia
import requests
import re
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

from collections import defaultdict

In [76]:
ru_link = "https://ru.wikipedia.org/wiki/Университет_Иннополис"
en_link = "https://en.wikipedia.org/wiki/Innopolis_University"

In [77]:
def get_num_values(text):
    return len(re.findall(r"[0-9]+", text))

def words(text):
    tokenizer = RegexpTokenizer(r"\w+")
    return tokenizer.tokenize(text)

def sentences(text):
    return sent_tokenize(text)

def count_words(tokenized_text):
    counter = defaultdict(lambda: 0)
    for word in tokenized_text:
        counter[word] += 1
    return counter

In [78]:
en_content = wikipedia.page("Innopolis_University")
wikipedia.set_lang("ru")
ru_content = wikipedia.page("Университет_Иннополис")

['== История ==',
 '== Проектная и научная деятельность ==',
 '== Учебные программы ==',
 '== Особенности обучения ==',
 '== Руководство ==',
 '== Наблюдательный совет университета ==',
 '== Профессорско-преподавательский состав ==',
 '== Численность студентов ==',
 '== Структура университета ==',
 '== Кампус университета ==',
 '== Финансирование университета ==',
 '== Сотрудничество университета ==',
 '== Рейтинги ==',
 '== Примечания ==',
 '== Источники ==']

In [96]:
import json

metrics = {
    "ru": {
        "links": len(ru_content.links), 
        "words": len(words(ru_content.content)),
        "sentences": len(sentences(ru_content.content)),
        "numericals": get_num_values(ru_content.content),
        "chapters": len(re.findall(r"==.*==", ru_content.content))
    },
    "en": {
        "links": len(en_content.links), 
        "words": len(words(en_content.content)),
        "sentences": len(sentences(en_content.content)),
        "numericals": get_num_values(en_content.content),
        "chapters": len(re.findall(r"==.*==", en_content.content))
    }
}

print(json.dumps(metrics, indent=2, default=str))

print(f"For russian text: {json.dumps(count_words(words(en_content.content)), indent=2, default=str, ensure_ascii=False)}\n\n")
print(f"For english text: {json.dumps(count_words(words(ru_content.content)), indent=2, default=str, ensure_ascii=False)}\n\n")

{
  "ru": {
    "links": 351,
    "words": 3117,
    "sentences": 151,
    "numericals": 212,
    "chapters": 28
  },
  "en": {
    "links": 71,
    "words": 3066,
    "sentences": 169,
    "numericals": 201,
    "chapters": 15
  }
}
For russian text: {
  "Университет": 18,
  "Иннополис": 44,
  "тат": 1,
  "университеты": 1,
  "автономияле": 1,
  "коммерцияле": 1,
  "булмаган": 1,
  "югары": 1,
  "белем": 1,
  "бирү": 1,
  "оешмасы": 1,
  "российская": 1,
  "автономная": 1,
  "некоммерческая": 1,
  "организация": 1,
  "высшего": 5,
  "образования": 9,
  "в": 107,
  "городе": 5,
  "Верхнеуслонский": 1,
  "район": 1,
  "Республика": 3,
  "Татарстан": 9,
  "специализирующаяся": 1,
  "на": 41,
  "образовании": 2,
  "исследованиях": 1,
  "и": 134,
  "разработках": 1,
  "области": 11,
  "информационных": 6,
  "технологий": 21,
  "робототехники": 11,
  "Первоначально": 1,
  "университет": 3,
  "действовал": 1,
  "Казань": 1,
  "с": 21,
  "сентября": 2,
  "2015": 9,
  "года": 31,
  "открыт": 1

## Task 2

Write a python program that does the following :
1. Retrieve data from sklearn --> (from sklearn.datasets import fetch_20newsgroups)
2. Preprocess the data 
3. Do classification using any classical machine learning method

In [80]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups()

In [81]:
# Remove extra spaces, quotes and e-mails
for key, _ in enumerate(dataset.data):
    dataset.data[key].replace(r"(From :\S*@\S*\s?) | (\') | (\")", "")
    dataset.data[key].replace(r"\s+", " ")

In [82]:
# Reference: https://blog.cambridgespark.com/tutorial-an-introduction-to-text-classification-d36769c593ba

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, make_scorer

vectorizer = TfidfVectorizer(stop_words="english", min_df=3)
X, y = dataset.data, dataset.target

lr_pipeline = Pipeline([
    ('TF/IDF Vectorizer', vectorizer),
    ('Multinomial Logistic Regression', LogisticRegression(class_weight="balanced",solver="lbfgs", multi_class="multinomial", max_iter=1000))
])


accuracy, precision, recall, f1 = make_scorer(accuracy_score), \
                                  make_scorer(precision_score, average="macro"), make_scorer(recall_score, average="macro"), make_scorer(f1_score, average="macro")
scoring = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1
}                                  
cross_validate(lr_pipeline, X, y, scoring=scoring)


{'fit_time': array([20.26185131, 19.30852914, 20.51253009, 19.19468284, 21.57319403]),
 'score_time': array([1.06772637, 1.11374068, 1.1017158 , 0.9401474 , 0.97732687]),
 'test_accuracy': array([0.89703933, 0.90057446, 0.89527176, 0.90278392, 0.9071618 ]),
 'test_precision': array([0.90079286, 0.90239564, 0.89663541, 0.90323813, 0.91120564]),
 'test_recall': array([0.89499542, 0.89895654, 0.89369611, 0.90027396, 0.90567413]),
 'test_f1': array([0.89697356, 0.90020187, 0.89461968, 0.90138168, 0.90750561])}

## Task 3

Write a python program that does the following :
1. Preprocess the given data
2. Find entities in the data using regular expressions: dates, names, locations
3. Anonymise the names of US presidents
4. Highlight the locations
5. Sort by dates

In [83]:
TEXT = [
    "Barack Obama was the 44th president of the US and he followed George W. Bush and was followed by Donald Trump in 2017",
    "As a young man, George H.W. Bush served in World War II as a fighter pilot. In 1944, he was shot down and had to parachute to safety.",
    "Before he was president, George W. Bush was a cheerleader, a fraternity brother, an oilman, an owner of a professional baseball team, and a governor. After leaving office in 2009, Bush learned to paint.",
    "Here's something else you probably didn't know about John Adams: He died on the Fourth of July. And he wasn't the only commander in chief to do so. In fact, three of the nation's five founding fathers—Adams, Thomas Jefferson, and James Monroe—died on Independence Day. Adams and Jefferson even passed on the same exact day: July 4, 1826, which happened to be the 50th anniversary of the adoption of the Declaration of Independence.",
    "At 6 feet 4 inches tall, Abraham Lincoln and Lyndon B. Johnson were America's tallest presidents. But what about America's shortest president? That distinction goes to founding father James Madison (1809-1817), who, at 5 feet 4 inches tall, was a full foot shorter than his tallest peers.",
    "That changed, however, in October 1860, when Lincoln received a letter from an 11-year-old girl named Grace Bedell. 'If you will let your whiskers grow I will try and get [my brothers] to vote for you,' Bedell wrote to Lincoln. 'You would look a great deal better for your face is so thin. All the ladies like whiskers and they would tease their husbands to vote for you and then you would be president.'",
    "Richard Nixon was hardly the first president who liked to unwind by rolling a few strikes. Harry S. Truman also enjoyed bowling, and opened the first White House bowling alley in 1947. ",
    "If you had to bet on which U.S. president was the biggest movie fan, you'd probably put your money on America's actor-turned-president, Ronald Reagan (1981-1989). And that would be a great guess. Reagan reportedly watched 363 movies during his two terms in office.",
    "Thomas Jefferson offered to sell his personal library when the Library of Congress was burned by the British during the War of 1812. He sold them 6487 books from his own collection, the largest in America at the time.",
    "Born in New York in 1782, Martin Van Buren was the first president to have been born after the American Revolution, technically making him the first American-born president.",
    "Benjamin Harrison had a tight-knit family and loved to amuse and dote on his grandchildren. He put up the first recorded White House Christmas tree in 1889, and was known to put on the Santa suit for entertainment.",
    "A 16-year-old Bill Clinton managed to shake hands with President John F. Kennedy at a Boys Nation event in 1963. This would take place just four months before Kennedy's assassination.",
    "In 1993—two years before he became the governor of Texas—George W. Bush ran the Houston marathon, finishing with a time of 3:44:52. He is the only president to have ever run a marathon.",
]

In [84]:
%pip install datefinder

Note: you may need to restart the kernel to use updated packages.


In [85]:
presidents = requests.get("https://gist.githubusercontent.com/namuol/2657233/raw/74135b2637e624848c163759be9cd14ae33f5153/presidents.csv").text
presidents = presidents.split('\n')[1:]
presidents = [y.split(',')[1] for y in presidents]
presidents
presidents += ["Donald Trump", "Joe Biden"]

In [86]:
from datefinder import find_dates

In [87]:
%pip install geotext

Note: you may need to restart the kernel to use updated packages.


In [88]:
from geotext import GeoText
def find_locations(text):
    geotext = GeoText(text)
    return geotext.cities

In [91]:
# Reference: https://stackoverflow.com/questions/20290870/improving-the-extraction-of-human-names-with-nltk

import nltk

name = r"[A-Z][A-Za-z]+\s[A-Z](?:\.\s[A-Z][A-Za-z]+|[A-Za-z]+)"


def get_human_names(text):
    person_list = []
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary = False)

    person = []
    name = ""
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leaf in subtree.leaves():
            person.append(leaf[0])
        for part in person:
                name += part + ' '
        if name[:-1] not in person_list:
                person_list.append(name[:-1])
        name = ''
        person = []
    return person_list

def in_extracted(name_, extracted_names):
    parts = name_.split(' ')
    first, last = parts[0], parts[-1]
    return first in extracted_names or last in extracted_names or name_ in extracted_names

def process(text: str):
    names = re.findall(name, text)
    extracted_names = get_human_names(text)
    i = 1
    anonymizer = {}

    for name_ in set.intersection(set(presidents), set(names)):
    # for name_ in presidents:
        anonymizer[name_] = i
        i += 1
    print(anonymizer)
    for extracted_name in extracted_names:
        for name_, i in list(anonymizer.items()):
            if extracted_name in name_.split(" ") or extracted_name == name:
                anonymizer[extracted_name] = i


    output = "" + text
    for name_, i in anonymizer.items():
        output = output.replace(name_, f"X{i}")
    locations = find_locations(text)
    for location in locations:
        output = output.replace(location, f"**{location}**")
    dates = list(find_dates(text))
    
    return output, extracted_names, dates, locations

for sentence in TEXT:
    modified_text, names, dates, locations = process(sentence)
    dates = [date.strftime("%Y-%m-%d") for date in dates]
    print(sentence)
    print(modified_text)
    print(f"Names: {names}")
    print(f"Dates: {dates}")
    print(f"Locations: {locations}")

{'Barack Obama': 1, 'George W. Bush': 2, 'Donald Trump': 3}
Barack Obama was the 44th president of the US and he followed George W. Bush and was followed by Donald Trump in 2017
X1 was the 44th president of the US and he followed X2 and was followed by X3 in 2017
Names: ['Barack', 'Obama', 'George W. Bush', 'Donald Trump']
Dates: ['2044-01-30', '2023-02-01']
Locations: ['George']
{}
As a young man, George H.W. Bush served in World War II as a fighter pilot. In 1944, he was shot down and had to parachute to safety.
As a young man, **George** H.W. Bush served in World War II as a fighter pilot. In 1944, he was shot down and had to parachute to safety.
Names: ['George', 'Bush']
Dates: ['1944-01-30']
Locations: ['George']
{'George W. Bush': 1}
Before he was president, George W. Bush was a cheerleader, a fraternity brother, an oilman, an owner of a professional baseball team, and a governor. After leaving office in 2009, Bush learned to paint.
Before he was president, X1 was a cheerleader, 

In [90]:
LOCATIONS = [...]
NAMES = [...]



## Task 4

Write a python program that does the following :
1. Preprocess the data from Task 3
2. Find entities in the data using [Gazetteers](https://gatenlp.readthedocs.io/en/latest/gazetteers/) from gatenlp: dates, names, locations
3. Anonymise the names of US presidents
4. Highlight the locations
5. Sort by dates
