In [1]:
# Create the text dataset, we will use the wikipedia
import wikipedia as wp
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Download the full plain-text content of the "FIFA World Cup" article.
# wp.page() needs a title (or a pageid); calling it with no arguments raises an error.
data = wp.page("FIFA World Cup").content

In [3]:
# Fetch only the summary of this page (limited to 10 sentences) into a new variable "paragraph"
paragraph = wp.summary("FIFA World Cup",sentences=10)

In [4]:
print(paragraph)

The FIFA World Cup, often simply called the World Cup, is an international association football competition contested by the senior men's national teams of the members of the Fédération Internationale de Football Association (FIFA), the sport's global governing body. The championship has been awarded every four years since the inaugural tournament in 1930, except in 1942 and 1946 when it was not held because of the Second World War. The current champion is France, which won its second title at the 2018 tournament in Russia.
The current format involves a qualification phase, which takes place over the preceding three years, to determine which teams qualify for the tournament phase. In the tournament phase, 32 teams, including the automatically qualifying host nation(s), compete for the title at venues within the host nation(s) over about a month.
The 21 World Cup tournaments have been won by eight national teams. Brazil have won five times, and they are the only team to have played in e

In [5]:
# Now split the complete paragraph into a list of individual sentences
sentences = nltk.sent_tokenize(paragraph,language="english")

In [6]:
print(sentences)

["The FIFA World Cup, often simply called the World Cup, is an international association football competition contested by the senior men's national teams of the members of the Fédération Internationale de Football Association (FIFA), the sport's global governing body.", 'The championship has been awarded every four years since the inaugural tournament in 1930, except in 1942 and 1946 when it was not held because of the Second World War.', 'The current champion is France, which won its second title at the 2018 tournament in Russia.', 'The current format involves a qualification phase, which takes place over the preceding three years, to determine which teams qualify for the tournament phase.', 'In the tournament phase, 32 teams, including the automatically qualifying host nation(s), compete for the title at venues within the host nation(s) over about a month.', 'The 21 World Cup tournaments have been won by eight national teams.', 'Brazil have won five times, and they are the only team

In [7]:
len(sentences)

10

There are a total of 10 sentences in the paragraph.

In [8]:
# Create a function that cleans the complete paragraph: lowercase it, drop every
# character that is not an ASCII letter, tokenize it, and remove English stopwords.
def clean_words(text):
    """Return the lowercase, letters-only, non-stopword tokens of ``text``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Tokens in their original order with English stopwords removed.
        No stemming or lemmatization is applied here.

    Notes
    -----
    Only the ASCII letters a-z survive the regex, so digits and punctuation
    disappear and accented words are split apart (for example "Fédération"
    yields the tokens "f" and "ration").
    """
    # convert into lowercase
    lower = text.lower()
    # replace every character except ASCII letters with a space
    paragraph = re.sub("[^a-zA-Z]", " ", lower)
    # word tokenization of the complete paragraph
    token_words = nltk.word_tokenize(paragraph)
    # Build the stopword set once instead of once per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # remove the stopwords
    return [w for w in token_words if w not in stop_words]

In [9]:
# Cleaned (lowercase, letters-only, stopword-free) tokens of the summary paragraph
words = clean_words(paragraph)

In [10]:
len(words)

146

In [11]:
# Define the stemmer object (NLTK's PorterStemmer) used by stem_words below
stemmer = PorterStemmer()

In [12]:
# Create a function that cleans the complete paragraph and returns the stemmed words
def stem_words(text):
    """Return the stemmed, non-stopword tokens of ``text``.

    The text is lowercased, every character that is not an ASCII letter is
    replaced by a space, the result is tokenized, English stopwords are
    dropped, and each remaining token is passed through the module-level
    ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        One stem per surviving token, in the original token order.
    """
    # convert into lowercase
    lower = text.lower()
    # replace every character except ASCII letters with a space
    paragraph = re.sub("[^a-zA-Z]", " ", lower)
    # word tokenization of the complete paragraph
    token_words = nltk.word_tokenize(paragraph)
    # Build the stopword set once instead of once per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Stopwords are filtered on the raw token, before stemming is applied.
    return [stemmer.stem(w) for w in token_words if w not in stop_words]

In [13]:
# Words in all sentences after stemming.
# NOTE(review): this assignment rebinds the name `stem_words` from the function to
# its result list, so re-running this cell without first re-running the cell that
# defines the function fails ('list' object is not callable). Consider a distinct
# name for the result and update the cells that read it.
stem_words = stem_words(paragraph)

In [14]:
len(stem_words)

146

In [15]:
# Display each cleaned word next to its stemmed form
list(zip(words,stem_words))

[('fifa', 'fifa'),
 ('world', 'world'),
 ('cup', 'cup'),
 ('often', 'often'),
 ('simply', 'simpli'),
 ('called', 'call'),
 ('world', 'world'),
 ('cup', 'cup'),
 ('international', 'intern'),
 ('association', 'associ'),
 ('football', 'footbal'),
 ('competition', 'competit'),
 ('contested', 'contest'),
 ('senior', 'senior'),
 ('men', 'men'),
 ('national', 'nation'),
 ('teams', 'team'),
 ('members', 'member'),
 ('f', 'f'),
 ('ration', 'ration'),
 ('internationale', 'international'),
 ('de', 'de'),
 ('football', 'footbal'),
 ('association', 'associ'),
 ('fifa', 'fifa'),
 ('sport', 'sport'),
 ('global', 'global'),
 ('governing', 'govern'),
 ('body', 'bodi'),
 ('championship', 'championship'),
 ('awarded', 'award'),
 ('every', 'everi'),
 ('four', 'four'),
 ('years', 'year'),
 ('since', 'sinc'),
 ('inaugural', 'inaugur'),
 ('tournament', 'tournament'),
 ('except', 'except'),
 ('held', 'held'),
 ('second', 'second'),
 ('world', 'world'),
 ('war', 'war'),
 ('current', 'current'),
 ('champion', '

In [16]:
# Define the lemmatizer object (NLTK's WordNetLemmatizer) used by lem_words below
lemmatizer = WordNetLemmatizer()

In [18]:
# Create a function that cleans the complete paragraph and returns the lemmatized words
def lem_words(text):
    """Return the lemmatized, non-stopword tokens of ``text``.

    The text is lowercased, every character that is not an ASCII letter is
    replaced by a space, the result is tokenized, English stopwords are
    dropped, and each remaining token is passed through the module-level
    ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        One lemma per surviving token, in the original token order.

    Notes
    -----
    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default part of speech is used for every token.
    """
    # convert into lowercase
    lower = text.lower()
    # replace every character except ASCII letters with a space
    paragraph = re.sub("[^a-zA-Z]", " ", lower)
    # word tokenization of the complete paragraph
    token_words = nltk.word_tokenize(paragraph)
    # Build the stopword set once instead of once per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Apply the lemmatization and also remove the stopwords
    return [lemmatizer.lemmatize(w) for w in token_words if w not in stop_words]

In [19]:
# Words in all sentences after lemmatization
lemmitize_words = lem_words(paragraph)

In [20]:
len(lemmitize_words)

146

In [21]:
# Display each cleaned word next to its lemmatized form
list(zip(words,lemmitize_words))

[('fifa', 'fifa'),
 ('world', 'world'),
 ('cup', 'cup'),
 ('often', 'often'),
 ('simply', 'simply'),
 ('called', 'called'),
 ('world', 'world'),
 ('cup', 'cup'),
 ('international', 'international'),
 ('association', 'association'),
 ('football', 'football'),
 ('competition', 'competition'),
 ('contested', 'contested'),
 ('senior', 'senior'),
 ('men', 'men'),
 ('national', 'national'),
 ('teams', 'team'),
 ('members', 'member'),
 ('f', 'f'),
 ('ration', 'ration'),
 ('internationale', 'internationale'),
 ('de', 'de'),
 ('football', 'football'),
 ('association', 'association'),
 ('fifa', 'fifa'),
 ('sport', 'sport'),
 ('global', 'global'),
 ('governing', 'governing'),
 ('body', 'body'),
 ('championship', 'championship'),
 ('awarded', 'awarded'),
 ('every', 'every'),
 ('four', 'four'),
 ('years', 'year'),
 ('since', 'since'),
 ('inaugural', 'inaugural'),
 ('tournament', 'tournament'),
 ('except', 'except'),
 ('held', 'held'),
 ('second', 'second'),
 ('world', 'world'),
 ('war', 'war'),
 ('