<div style="font-size:25pt; line-height:100px; font-weight:700">NLP: preprocessing</div>

# Imports

In [37]:
# Generics
import pandas as pd
import unicodedata
pd.set_option('display.max_columns', None)

# punctuation
import string

# regex
import re

# stopwords
from nltk.corpus import stopwords as nltksw

# tokenization (split(" "))
from nltk.tokenize import word_tokenize

# lemmatizer
from nltk.stem import WordNetLemmatizer

# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Utils definitions

## Lowercase the text and replace tabs, newlines and carriage returns with spaces

In [2]:
def lowerize(df, label):
    """Lowercase a text column and turn line breaks and tabs into spaces.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to normalize.

    Returns:
        The same DataFrame, with ``df[label]`` lowercased and every
        newline, carriage return and tab replaced by one space.
    """
    # A translation table swaps the three whitespace characters for a space
    # in a single pass over each string.
    whitespace_to_space = str.maketrans({"\n": " ", "\r": " ", "\t": " "})
    df[label] = df[label].str.lower()
    df[label] = df[label].apply(lambda text: text.translate(whitespace_to_space))
    return df

## Removes emails

In [19]:
def remove_emails(df, label):
    """Replace every e-mail address found in a text column with a space.

    The pattern only lists lowercase letters, so it is meant to run on
    text that has already been lowercased.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to clean.

    Returns:
        The same DataFrame with the addresses blanked out.
    """
    email_pattern = re.compile(r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""")
    df[label] = df[label].apply(lambda text: email_pattern.sub(" ", text))
    return df

## Remove mentions

In [20]:
def remove_mentions(df, label):
    """Replace Twitter-style mentions (``@name``) in a text column with a space.

    A mention is ``@`` followed by 1 to 100 letters, digits, underscores,
    dots or hyphens.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to clean.

    Returns:
        The same DataFrame with the mentions blanked out.
    """
    mention_pattern = re.compile(r"@([a-zA-Z0-9_.-]{1,100})")
    without_mentions = df[label].apply(lambda text: mention_pattern.sub(" ", text))
    df[label] = without_mentions
    return df

## Remove hyperlinks

In [21]:
def remove_hyperlinks(df, label):
    """Replace hyperlinks in a text column with a space.

    Anything starting with ``http`` is removed up to the next whitespace.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to clean.

    Returns:
        The same DataFrame with the links blanked out.
    """
    link_pattern = re.compile(r"http\S+")

    def strip_links(text):
        return link_pattern.sub(" ", text)

    df[label] = df[label].apply(strip_links)
    return df

## Remove hashtags

In [22]:
def remove_hashtags(df, label):
    """Replace hashtags (``#`` followed by word characters) with a space.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to clean.

    Returns:
        The same DataFrame with the hashtags blanked out.
    """
    hashtag_pattern = re.compile(r"#\w+")
    df[label] = df[label].apply(lambda text: hashtag_pattern.sub(" ", text))
    return df

## Removes HTML tags

In [23]:
def remove_html_tags(df, label):
    """Replace HTML tags in a text column with a space.

    A tag is the shortest span going from ``<`` to the next ``>``.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to clean.

    Returns:
        The same DataFrame with the tags blanked out.
    """
    tag_pattern = re.compile(r"<.*?>")

    def strip_tags(text):
        return tag_pattern.sub(" ", text)

    df[label] = df[label].apply(strip_tags)
    return df

## Remove numbers

In [24]:
def remove_numbers(df, label):
    """Replace every run of digits in a text column with a space.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to clean.

    Returns:
        The same DataFrame with the numbers blanked out.
    """
    digits_pattern = re.compile(r"\d+")
    without_digits = df[label].apply(lambda text: digits_pattern.sub(" ", text))
    df[label] = without_digits
    return df

## Encode unknown characters

In [25]:
def encode_unknown(df, label):
    """Reduce a text column to plain ASCII.

    Each text is NFD-normalized, which splits an accented letter into its
    base letter plus a combining mark, and every non-ASCII character is then
    dropped. Accented letters therefore lose their accent, and characters
    without an ASCII base (e.g. typographic quotes) disappear.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to convert.

    Returns:
        The same DataFrame with ASCII-only text in ``df[label]``.
    """
    def to_ascii(text):
        decomposed = unicodedata.normalize("NFD", text)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode("utf-8")

    df[label] = df[label].apply(to_ascii)
    return df

## Remove punctuation (english only)

In [26]:
def clean_punctuation_no_accent(df, label):
    """Replace punctuation in a text column with spaces.

    Every character that is neither a word character nor whitespace becomes
    a space. Accented letters count as word characters and are kept, so
    this step is intended for text without accents (e.g. English, or text
    already passed through ``encode_unknown``).

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to clean.

    Returns:
        The same DataFrame with punctuation blanked out.
    """
    punctuation_pattern = re.compile(r'[^\w\s]')
    df[label] = df[label].apply(lambda text: punctuation_pattern.sub(' ', text))
    return df

## Remove stop words

In [27]:
def remove_stop_words(text, stopwords=None):
    """Remove stop words from a text.

    The text is split on single spaces, so consecutive spaces produce empty
    tokens that are kept as-is and the original spacing between surviving
    words is preserved.

    Args:
        text: Text to filter.
        stopwords: Collection of words to drop. When omitted, the NLTK
            English stop-word list is used. Pass an extended set to also
            drop domain-specific words.

    Returns:
        The text without the tokens found in ``stopwords``.
    """
    if stopwords is None:
        # Load the NLTK list on first use rather than at definition time, so
        # defining the function does not require the corpus, and cache it on
        # the function so the corpus is read only once.
        cached = getattr(remove_stop_words, "_default_stopwords", None)
        if cached is None:
            cached = frozenset(nltksw.words('english'))
            remove_stop_words._default_stopwords = cached
        stopwords = cached

    return " ".join(word for word in text.split(" ") if word not in stopwords)

def clean_stopwords(df, label):
    """Remove stop words from every text of a column.

    Each text goes through ``remove_stop_words`` with its default
    stop-word list.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to filter.

    Returns:
        The same DataFrame with stop words removed from ``df[label]``.
    """
    filtered = df[label].apply(remove_stop_words)
    df[label] = filtered
    return df

## More cleaning

In [28]:
def more_cleaning(df, label):
    """Drop very short words, squeeze whitespace and discard emptied rows.

    Steps applied to ``df[label]``:
      1) every one- or two-character word is replaced by a space;
      2) runs of two or more spaces/tabs collapse into a single space;
      3) texts left empty or reduced to a single character are set to NaN
         and their rows are dropped.

    Args:
        df: DataFrame holding the text column; the column is rewritten in
            place before the filtered copy is built.
        label: Name of the text column to clean.

    Returns:
        A new DataFrame without the emptied rows, re-indexed from 0.
    """
    short_word = re.compile(r'\b\w{1,2}\b')
    space_run = re.compile(r"[ \t]{2,}")
    cleaned = df[label].apply(
        lambda text: space_run.sub(" ", short_word.sub(" ", text))
    )
    # Series.where() writes NaN wherever the condition is False, so numpy
    # does not have to be referenced here (the notebook's import cell does
    # not bring in ``np``).
    df[label] = cleaned.where(cleaned.map(len) > 1)
    return df.dropna(subset=[label], axis=0).reset_index(drop=True).copy()

## Lemmatization

In [81]:
def lemmatize_one_text(text):
    """Lemmatize every space-separated word of a text.

    Each word is tried with the WordNet tags 'a', 'r', 'n' and 'v', in that
    order. The lemma produced by the last tag that changes the word is
    kept; a word that no tag changes is left untouched.

    Args:
        text: Text to lemmatize; it is split on single spaces.

    Returns:
        The lemmatized words joined back with single spaces.
    """
    wordnet = WordNetLemmatizer()
    pos_tags = ('a', 'r', 'n', 'v')

    def best_lemma(token):
        # Later tags override earlier ones, so keep the latest lemma that
        # differs from the token.
        chosen = token
        for pos in pos_tags:
            candidate = wordnet.lemmatize(token, pos)
            if candidate != token:
                chosen = candidate
        return chosen

    return " ".join(best_lemma(token) for token in text.split(" "))

def lemmatize(df, label):
    """Lemmatize every text of a column with ``lemmatize_one_text``.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        label: Name of the column to lemmatize.

    Returns:
        The same DataFrame with lemmatized text in ``df[label]``.
    """
    lemmatized = df[label].apply(lemmatize_one_text)
    df[label] = lemmatized
    return df

## Vocabulary richness (optional)

In [30]:
def vocabulary_richness(text):
    """Return the vocabulary richness of a text.

    Richness is the number of distinct tokens divided by the total number
    of tokens, both taken from NLTK's ``word_tokenize``.

    Args:
        text: Text to score.

    Returns:
        The unique/total token ratio, or 0.0 when the text yields no token
        (e.g. an empty string), which previously raised ZeroDivisionError.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

# Define some texts

In [12]:
# Demo corpus: six short English texts about football. The first one includes
# the contractions "shouldn't" and "won't".
texts = ["After shouldn't won't a discussion of Purdue's greatest Olympian we talk about the Athlon Sports article that gives anonymous quotes about Big Ten football teams. The quotes for Purdue are... not good.",
         "football, also called association football or soccer, game in which two teams of 11 players, using any part of their bodies except their hands and arms, try to maneuver the ball into the opposing team’s goal.",
         "Julia Grosso netted the winning penalty as Canada claimed their first-ever gold medal in football to break Sweden hearts after a dramatic shootout at Yokohama Stadium.",
         "With the start of the Premier League season just four days away, Manchester City are set to welcome a number of first-team players back to training this week. However, they will be without Phil Foden for the first few games of the campaign due to injury while Kevin De Bruyne is also recovering from an ankle issue.",
         "Barcelona said medical tests showed that Aguero has a tendon injury in his right calf and will be sidelined for about 10 weeks.",
         "The six-time Ballon d'Or winner had an emotional Barcelona farewell today and is now set to embark on a new chapter in the French capital, with the official announcement expected to come over the next few days."]

In [13]:
# Wrap the sample texts in a single-column DataFrame and preview five random rows.
texts_df = pd.DataFrame({"text": texts})
display(texts_df.sample(5))

Unnamed: 0,text
3,With the start of the Premier League season ju...
1,"football, also called association football or ..."
4,Barcelona said medical tests showed that Aguer...
0,After shouldn't won't a discussion of Purdue's...
5,The six-time Ballon d'Or winner had an emotion...


# Preprocessing

In [82]:
# Work on a copy of the corpus: the raw "text" column stays untouched and the
# cleaning steps below rewrite the "textClean" column.
col_label = "textClean"
df_train = texts_df.copy()
df_train[col_label] = texts_df["text"]

## Lowercase

In [83]:
# Lowercase the working column and replace newlines, carriage returns and tabs
# with spaces, then preview the result.
df_train = lowerize(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,after shouldn't won't a discussion of purdue's...
1,"football, also called association football or ...","football, also called association football or ..."
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted the winning penalty as can...
3,With the start of the Premier League season ju...,with the start of the premier league season ju...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed that aguer...


## Remove emails

In [84]:
# Replace e-mail addresses with a space, then preview the result.
df_train = remove_emails(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,after shouldn't won't a discussion of purdue's...
1,"football, also called association football or ...","football, also called association football or ..."
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted the winning penalty as can...
3,With the start of the Premier League season ju...,with the start of the premier league season ju...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed that aguer...


## Remove mentions

In [85]:
# Replace Twitter-style @mentions with a space, then preview the result.
df_train = remove_mentions(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,after shouldn't won't a discussion of purdue's...
1,"football, also called association football or ...","football, also called association football or ..."
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted the winning penalty as can...
3,With the start of the Premier League season ju...,with the start of the premier league season ju...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed that aguer...


## Remove hyperlinks

In [86]:
# Replace anything starting with "http" (up to the next whitespace) with a
# space, then preview the result.
df_train = remove_hyperlinks(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,after shouldn't won't a discussion of purdue's...
1,"football, also called association football or ...","football, also called association football or ..."
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted the winning penalty as can...
3,With the start of the Premier League season ju...,with the start of the premier league season ju...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed that aguer...


## Remove hashtags

In [87]:
# Replace #hashtags with a space, then preview the result.
df_train = remove_hashtags(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,after shouldn't won't a discussion of purdue's...
1,"football, also called association football or ...","football, also called association football or ..."
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted the winning penalty as can...
3,With the start of the Premier League season ju...,with the start of the premier league season ju...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed that aguer...


## Remove HTML tags

In [88]:
# Replace HTML tags with a space, then preview the result.
df_train = remove_html_tags(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,after shouldn't won't a discussion of purdue's...
1,"football, also called association football or ...","football, also called association football or ..."
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted the winning penalty as can...
3,With the start of the Premier League season ju...,with the start of the premier league season ju...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed that aguer...


## Remove numbers

In [89]:
# Replace runs of digits with a space, then preview the result.
df_train = remove_numbers(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,after shouldn't won't a discussion of purdue's...
1,"football, also called association football or ...","football, also called association football or ..."
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted the winning penalty as can...
3,With the start of the Premier League season ju...,with the start of the premier league season ju...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed that aguer...


## Encode unknown characters

In [90]:
# Strip accents and drop every remaining non-ASCII character, then preview the result.
df_train = encode_unknown(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,after shouldn't won't a discussion of purdue's...
1,"football, also called association football or ...","football, also called association football or ..."
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted the winning penalty as can...
3,With the start of the Premier League season ju...,with the start of the premier league season ju...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed that aguer...


## Remove punctuation (english only)

In [91]:
# Replace every character that is neither a word character nor whitespace with
# a space, then preview the result.
df_train = clean_punctuation_no_accent(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,after shouldn t won t a discussion of purdue s...
1,"football, also called association football or ...",football also called association football or ...
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted the winning penalty as can...
3,With the start of the Premier League season ju...,with the start of the premier league season ju...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed that aguer...


## Remove stop words

In [92]:
# Drop the words found in the default (NLTK English) stop-word list, then
# preview the result.
df_train = clean_stopwords(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,discussion purdue greatest olympian talk athlo...
1,"football, also called association football or ...",football also called association football soc...
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted winning penalty canada cla...
3,With the start of the Premier League season ju...,start premier league season four days away ma...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed aguero ten...


## More cleaning

In [93]:
# Remove one- and two-character words, collapse repeated spaces and drop the
# rows whose text ends up empty, then preview the result.
df_train = more_cleaning(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,discussion purdue greatest olympian talk athlo...
1,"football, also called association football or ...",football also called association football socc...
2,Julia Grosso netted the winning penalty as Can...,julia grosso netted winning penalty canada cla...
3,With the start of the Premier League season ju...,start premier league season four days away man...
4,Barcelona said medical tests showed that Aguer...,barcelona said medical tests showed aguero ten...


## Lemmatization

In [94]:
# Lemmatize every space-separated word of the working column, then preview the result.
df_train = lemmatize(df_train, col_label)
df_train.head(5)

Unnamed: 0,text,textClean
0,After shouldn't won't a discussion of Purdue's...,discussion purdue great olympian talk athlon s...
1,"football, also called association football or ...",football also call association football soccer...
2,Julia Grosso netted the winning penalty as Can...,julia grosso net win penalty canada claim firs...
3,With the start of the Premier League season ju...,start premier league season four day away manc...
4,Barcelona said medical tests showed that Aguer...,barcelona say medical test show aguero tendon ...


# Bags of words

bag of words = no context, just a word count

In [95]:
# Count how often each vocabulary token appears in each text of the raw
# "text" column (one row per text, one column per token).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts_df["text"])
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old accessor on releases that
# predate it.
bow_feature_names = (vectorizer.get_feature_names_out()
                     if hasattr(vectorizer, "get_feature_names_out")
                     else vectorizer.get_feature_names())
X_df = pd.DataFrame(X.toarray(), columns=bow_feature_names)

In [96]:
# Display the bag-of-words count matrix.
X_df

Unnamed: 0,10,11,about,after,aguero,also,an,and,ankle,announcement,anonymous,any,are,arms,article,as,association,at,athlon,away,back,ball,ballon,barcelona,be,big,bodies,break,bruyne,calf,called,campaign,canada,capital,chapter,city,claimed,come,days,de,discussion,dramatic,due,embark,emotional,ever,except,expected,farewell,few,first,foden,football,for,four,french,from,game,games,gives,goal,gold,good,greatest,grosso,had,hands,has,hearts,his,however,in,injury,into,is,issue,julia,just,kevin,league,manchester,maneuver,medal,medical,netted,new,next,not,now,number,of,official,olympian,on,opposing,or,over,part,penalty,phil,players,premier,purdue,quotes,recovering,right,said,season,set,shootout,shouldn,showed,sidelined,six,soccer,sports,stadium,start,sweden,talk,team,teams,ten,tendon,tests,that,the,their,they,this,time,to,today,training,try,two,using,we,week,weeks,welcome,which,while,will,winner,winning,with,without,won,yokohama
0,0,0,2,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,2,2,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,1,2,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,1,3,0,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,4,0,1,1,0,3,0,1,0,0,0,0,1,0,1,0,1,1,0,0,1,1,0,0
4,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0


# TF IDF representation

Three hyperparameters: <br>
-> <b>min_df</b>: used to filter out anomalies missed during preprocessing <br>
-> <b>max_df</b>: used to remove words that appear in most of the texts and therefore carry no significance <br>
-> <b>max_features</b>: used to keep only the most important words

In [28]:
# Build the TF-IDF matrix of the "text" column with default settings.
# NOTE(review): this fits on texts_df["text"], not on the cleaned
# df_train["textClean"] column produced above; confirm which one is intended.
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform(texts_df["text"])
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old accessor on releases that
# predate it.
tfidf_feature_names = (tfidf_vec.get_feature_names_out()
                       if hasattr(tfidf_vec, "get_feature_names_out")
                       else tfidf_vec.get_feature_names())
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_feature_names)

In [29]:
# Display the TF-IDF matrix.
X_tfidf_df

Unnamed: 0,aguero,also,ankle,announcement,anonymous,are,arm,article,association,athlon,away,back,ball,ballon,barcelona,big,body,break,bruyne,calf,called,campaign,canada,capital,chapter,city,claimed,come,day,de,discussion,dramatic,due,embark,emotional,ever,except,expected,farewell,first,foden,football,four,french,game,give,goal,gold,good,greatest,grosso,ha,hand,heart,however,injury,is,issue,julia,kevin,league,manchester,maneuver,medal,medical,netted,new,next,not,number,official,olympian,opposing,part,penalty,phil,player,premier,purdue,quote,recovering,right,said,season,set,shootout,should,showed,sidelined,six,soccer,sport,stadium,start,sweden,talk,team,ten,tendon,test,time,today,training,try,two,using,week,welcome,will,winner,winning,without,yokohama
0,0.0,0.0,0.0,0.0,0.176473,0.14471,0.0,0.176473,0.0,0.176473,0.0,0.0,0.0,0.0,0.0,0.176473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.122174,0.0,0.0,0.0,0.176473,0.0,0.0,0.176473,0.176473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.529418,0.0,0.0,0.176473,0.0,0.0,0.0,0.0,0.0,0.0,0.352946,0.352946,0.0,0.0,0.0,0.0,0.0,0.0,0.176473,0.0,0.0,0.0,0.0,0.176473,0.0,0.0,0.0,0.176473,0.122174,0.176473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.122174,0.0,0.0,0.0,0.0
1,0.0,0.179577,0.0,0.0,0.0,0.0,0.218993,0.0,0.218993,0.0,0.0,0.0,0.218993,0.0,0.0,0.0,0.218993,0.0,0.0,0.0,0.218993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218993,0.0,0.0,0.0,0.0,0.303223,0.0,0.0,0.179577,0.0,0.218993,0.0,0.0,0.0,0.0,0.0,0.218993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218993,0.218993,0.0,0.0,0.179577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218993,0.0,0.0,0.0,0.0,0.0,0.303223,0.0,0.0,0.0,0.0,0.0,0.0,0.218993,0.218993,0.218993,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234715,0.0,0.0,0.0,0.0,0.234715,0.0,0.0,0.0,0.234715,0.0,0.0,0.0,0.0,0.234715,0.0,0.0,0.0,0.234715,0.0,0.0,0.0,0.19247,0.0,0.162496,0.0,0.0,0.0,0.0,0.0,0.234715,0.0,0.0,0.234715,0.0,0.0,0.234715,0.0,0.0,0.0,0.0,0.234715,0.0,0.0,0.0,0.0,0.234715,0.0,0.234715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234715,0.0,0.0,0.0,0.0,0.0,0.0,0.234715,0.0,0.234715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234715,0.0,0.234715
3,0.0,0.141256,0.17226,0.0,0.0,0.141256,0.0,0.0,0.0,0.0,0.17226,0.17226,0.0,0.0,0.0,0.0,0.0,0.0,0.17226,0.0,0.0,0.17226,0.0,0.0,0.0,0.17226,0.0,0.0,0.141256,0.17226,0.0,0.0,0.17226,0.0,0.0,0.0,0.0,0.0,0.0,0.282511,0.17226,0.0,0.17226,0.0,0.141256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17226,0.141256,0.141256,0.17226,0.0,0.17226,0.17226,0.17226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17226,0.0,0.0,0.0,0.0,0.0,0.17226,0.141256,0.17226,0.0,0.0,0.17226,0.0,0.0,0.17226,0.141256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17226,0.0,0.0,0.119258,0.0,0.0,0.0,0.0,0.0,0.17226,0.0,0.0,0.0,0.141256,0.17226,0.119258,0.0,0.0,0.17226,0.0
4,0.282882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231967,0.0,0.0,0.0,0.0,0.282882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282882,0.0,0.0,0.0,0.231967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282882,0.282882,0.0,0.0,0.0,0.0,0.282882,0.282882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282882,0.282882,0.0,0.0,0.0,0.0,0.0,0.0,0.231967,0.0,0.195842,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.225362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225362,0.1848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225362,0.225362,0.0,0.0,0.225362,0.1848,0.0,0.0,0.0,0.0,0.225362,0.225362,0.0,0.0,0.225362,0.225362,0.0,0.0,0.0,0.0,0.225362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225362,0.225362,0.0,0.0,0.225362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1848,0.0,0.0,0.0,0.0,0.225362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225362,0.225362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225362,0.0,0.0,0.0


# N-Gram representation

Capture context by considering series of words instead of individual words<br>
Useful for sentiment analysis (good, not good)<br>
Controlled by the <b>ngram_range</b> parameter of the TF-IDF vectorizer, e.g. (1, 2) for unigrams and bigrams

In [32]:
# Build a TF-IDF matrix over unigrams and bigrams of the "text" column.
tfidf_gram = TfidfVectorizer(ngram_range = (1, 2))
X_tfidf_gram = tfidf_gram.fit_transform(texts_df["text"])
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old accessor on releases that
# predate it.
gram_feature_names = (tfidf_gram.get_feature_names_out()
                      if hasattr(tfidf_gram, "get_feature_names_out")
                      else tfidf_gram.get_feature_names())
X_tfidf_gram_df = pd.DataFrame(X_tfidf_gram.toarray(), columns=gram_feature_names)

In [34]:
# Show the unigram + bigram TF-IDF matrix, then its (rows, columns) shape.
display(X_tfidf_gram_df)
display(X_tfidf_gram_df.shape)

Unnamed: 0,aguero,aguero ha,also,also called,also recovering,ankle,ankle issue,announcement,announcement expected,anonymous,anonymous quote,are,are not,are set,arm,arm try,article,article give,association,association football,athlon,athlon sport,away,away manchester,back,back training,ball,ball opposing,ballon,ballon winner,barcelona,barcelona farewell,barcelona said,big,big ten,body,body except,break,break sweden,bruyne,bruyne is,calf,calf will,called,called association,campaign,campaign due,canada,canada claimed,capital,capital official,chapter,chapter french,city,city are,claimed,claimed first,come,come next,day,day away,de,de bruyne,discussion,discussion purdue,dramatic,dramatic shootout,due,due injury,embark,embark new,emotional,emotional barcelona,ever,ever gold,except,except hand,expected,expected come,farewell,farewell today,first,first ever,first game,first team,foden,foden first,football,football also,football break,football soccer,football team,four,four day,french,french capital,game,game campaign,game two,give,give anonymous,goal,gold,gold medal,good,greatest,greatest olympian,grosso,grosso netted,ha,ha tendon,hand,hand arm,heart,heart dramatic,however,however will,injury,injury kevin,injury right,is,is also,is set,issue,julia,julia grosso,kevin,kevin de,league,league season,manchester,manchester city,maneuver,maneuver ball,medal,medal football,medical,medical test,netted,netted winning,new,new chapter,next,next day,not,not discussion,not good,not will,number,number first,official,official announcement,olympian,olympian talk,opposing,opposing team,part,part body,penalty,penalty canada,phil,phil foden,player,player back,player using,premier,premier league,purdue,purdue are,purdue greatest,quote,quote big,quote purdue,recovering,recovering ankle,right,right calf,said,said medical,season,season four,set,set embark,set welcome,shootout,shootout yokohama,should,should not,showed,showed aguero,sidelined,sidelined week,six,six time,soccer,soccer 
game,sport,sport article,stadium,start,start premier,sweden,sweden heart,talk,talk athlon,team,team goal,team player,team quote,ten,ten football,tendon,tendon injury,test,test showed,time,time ballon,today,today is,training,training week,try,try maneuver,two,two team,using,using part,week,week however,welcome,welcome number,will,will not,will sidelined,will without,winner,winner emotional,winning,winning penalty,without,without phil,yokohama,yokohama stadium
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134705,0.134705,0.11046,0.134705,0.0,0.0,0.0,0.134705,0.134705,0.0,0.0,0.134705,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134705,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134705,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093258,0.0,0.0,0.0,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134705,0.134705,0.0,0.0,0.0,0.134705,0.134705,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.404115,0.134705,0.134705,0.134705,0.0,0.0,0.0,0.0,0.134705,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26941,0.134705,0.134705,0.26941,0.134705,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134705,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134705,0.134705,0.0,0.0,0.0,0.0,0.0,0.134705,0.134705,0.093258,0.0,0.0,0.134705,0.134705,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093258,0.134705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.127254,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.0,0.0,0.155185,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214873,0.155185,0.0,0.155185,0.0,0.0,0.0,0.0,0.0,0.127254,0.0,0.155185,0.0,0.0,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.155185,0.155185,0.0,0.0,0.0,0.0,0.127254,0.0,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214873,0.155185,0.127254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155185,0.155185,0.155185,0.155185,0.155185,0.155185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.136382,0.166317,0.0,0.0,0.0,0.0,0.115143,0.0,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.0,0.0,0.166317,0.166317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166317,0.166317,0.0,0.0,0.166317,0.166317
3,0.0,0.0,0.098453,0.0,0.120062,0.120062,0.120062,0.0,0.0,0.0,0.0,0.098453,0.0,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.120062,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0,0.0,0.0,0.098453,0.120062,0.120062,0.120062,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.196905,0.0,0.120062,0.120062,0.120062,0.120062,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0,0.098453,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.098453,0.120062,0.0,0.098453,0.120062,0.0,0.120062,0.0,0.0,0.120062,0.120062,0.120062,0.120062,0.120062,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.098453,0.120062,0.0,0.120062,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0,0.0,0.0,0.120062,0.120062,0.098453,0.0,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0,0.0,0.0,0.083121,0.0,0.098453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0,0.0,0.0,0.0,0.0,0.098453,0.120062,0.120062,0.120062,0.083121,0.0,0.0,0.120062,0.0,0.0,0.0,0.0,0.120062,0.120062,0.0,0.0
4,0.198043,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162398,0.0,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198043,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198043,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.162398,0.0,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198043,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198043,0.198043,0.198043,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198043,0.198043,0.198043,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198043,0.198043,0.198043,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162398,0.0,0.0,0.0,0.137108,0.0,0.198043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.130162,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.158731,0.158731,0.0,0.0,0.0,0.0,0.158731,0.158731,0.130162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.158731,0.158731,0.0,0.0,0.0,0.0,0.158731,0.158731,0.158731,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130162,0.0,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.158731,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130162,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.158731,0.158731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158731,0.158731,0.0,0.0,0.0,0.0,0.0,0.0


(6, 243)

# Vocab richness test

In [37]:
# Score every text with vocabulary_richness and store the result in a new column.
texts_df["vocabulary_richness"] = texts_df["text"].apply(vocabulary_richness)

In [38]:
# Show the dataframe to inspect the vocabulary_richness column computed above.
texts_df

Unnamed: 0,text,vocabulary_richness
0,should not will not discussion purdue greatest...,0.833333
1,football also called association football socc...,0.909091
2,julia grosso netted winning penalty canada cla...,1.0
3,start premier league season four day away manc...,0.972973
4,barcelona said medical test showed aguero ha t...,1.0
5,six time ballon winner emotional barcelona far...,1.0


# TF IDF with stop words parameters

<b>Synthesis of this test</b>: the <code>stop_words</code> parameter of <code>TfidfVectorizer</code> simply applies another stopword list, which is different from the NLTK one.

In [108]:
# Six short raw (unprocessed) English texts used as the corpus for the TF-IDF
# comparison with and without the stop_words parameter.
texts_A = ["After a discussion of Purdue's greatest Olympian we talk about the Athlon Sports article that gives anonymous quotes about Big Ten football teams. The quotes for Purdue are... not good.",
           "football, also called association football or soccer, game in which two teams of 11 players, using any part of their bodies except their hands and arms, try to maneuver the ball into the opposing team’s goal.",
           "Julia Grosso netted the winning penalty as Canada claimed their first-ever gold medal in football to break Sweden hearts after a dramatic shootout at Yokohama Stadium.",
           "With the start of the Premier League season just four days away, Manchester City are set to welcome a number of first-team players back to training this week. However, they will be without Phil Foden for the first few games of the campaign due to injury while Kevin De Bruyne is also recovering from an ankle issue.",
           "Barcelona said medical tests showed that Aguero has a tendon injury in his right calf and will be sidelined for about 10 weeks.",
           "The six-time Ballon d'Or winner had an emotional Barcelona farewell today and is now set to embark on a new chapter in the French capital, with the official announcement expected to come over the next few days."]

## Without stopwords parameter

In [109]:
# Baseline vectorizer: built with default arguments, i.e. no stop_words passed.
vecto_check = TfidfVectorizer()

In [117]:
# Fit the vectorizer on the raw texts and compute their TF-IDF matrix.
X_tfidf_check = vecto_check.fit_transform(texts_A)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vecto_check, "get_feature_names_out"):
    feature_names_check = vecto_check.get_feature_names_out()
else:
    feature_names_check = vecto_check.get_feature_names()

# One row per text, one column per vocabulary term.
X_tfidf_check_df = pd.DataFrame(X_tfidf_check.toarray(), columns=feature_names_check)
display(X_tfidf_check_df)
display(X_tfidf_check_df.shape)

Unnamed: 0,10,11,about,after,aguero,also,an,and,ankle,announcement,anonymous,any,are,arms,article,as,association,at,athlon,away,back,ball,ballon,barcelona,be,big,bodies,break,bruyne,calf,called,campaign,canada,capital,chapter,city,claimed,come,days,de,discussion,dramatic,due,embark,emotional,ever,except,expected,farewell,few,first,foden,football,for,four,french,from,game,games,gives,goal,gold,good,greatest,grosso,had,hands,has,hearts,his,however,in,injury,into,is,issue,julia,just,kevin,league,manchester,maneuver,medal,medical,netted,new,next,not,now,number,of,official,olympian,on,opposing,or,over,part,penalty,phil,players,premier,purdue,quotes,recovering,right,said,season,set,shootout,showed,sidelined,six,soccer,sports,stadium,start,sweden,talk,team,teams,ten,tendon,tests,that,the,their,they,this,time,to,today,training,try,two,using,we,week,weeks,welcome,which,while,will,winner,winning,with,without,yokohama
0,0.0,0.0,0.300092,0.150046,0.0,0.0,0.0,0.0,0.0,0.0,0.18298,0.0,0.150046,0.0,0.18298,0.0,0.0,0.0,0.18298,0.0,0.0,0.0,0.0,0.0,0.0,0.18298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.126679,0.126679,0.0,0.0,0.0,0.0,0.0,0.18298,0.0,0.0,0.18298,0.18298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18298,0.0,0.0,0.126679,0.0,0.18298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.365959,0.365959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18298,0.0,0.0,0.0,0.18298,0.0,0.150046,0.18298,0.0,0.0,0.150046,0.187491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.176448,0.0,0.0,0.0,0.14469,0.0,0.122157,0.0,0.0,0.0,0.176448,0.0,0.176448,0.0,0.0,0.176448,0.0,0.0,0.0,0.0,0.176448,0.0,0.0,0.0,0.0,0.176448,0.0,0.0,0.0,0.176448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176448,0.0,0.0,0.0,0.0,0.0,0.244314,0.0,0.0,0.0,0.0,0.176448,0.0,0.0,0.176448,0.0,0.0,0.0,0.0,0.0,0.176448,0.0,0.0,0.0,0.0,0.104679,0.0,0.176448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244314,0.0,0.0,0.0,0.176448,0.14469,0.0,0.176448,0.0,0.0,0.14469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176448,0.0,0.0,0.0,0.0,0.0,0.14469,0.14469,0.0,0.0,0.0,0.0,0.180798,0.28938,0.0,0.0,0.0,0.104679,0.0,0.0,0.176448,0.176448,0.176448,0.0,0.0,0.0,0.0,0.176448,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.173017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210992,0.0,0.210992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210992,0.0,0.0,0.0,0.0,0.210992,0.0,0.0,0.0,0.210992,0.0,0.0,0.0,0.0,0.210992,0.0,0.0,0.0,0.210992,0.0,0.0,0.0,0.0,0.173017,0.0,0.146073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210992,0.0,0.0,0.210992,0.0,0.0,0.0,0.210992,0.0,0.0,0.125173,0.0,0.0,0.0,0.0,0.210992,0.0,0.0,0.0,0.0,0.0,0.210992,0.0,0.210992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210992,0.0,0.0,0.0,0.0,0.0,0.210992,0.0,0.210992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108097,0.173017,0.0,0.0,0.0,0.125173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210992,0.0,0.0,0.210992
3,0.0,0.0,0.0,0.0,0.0,0.110984,0.110984,0.0,0.135344,0.0,0.0,0.0,0.110984,0.0,0.0,0.0,0.0,0.0,0.0,0.135344,0.135344,0.0,0.0,0.0,0.110984,0.0,0.0,0.0,0.135344,0.0,0.0,0.135344,0.0,0.0,0.0,0.135344,0.0,0.0,0.110984,0.135344,0.0,0.0,0.135344,0.0,0.0,0.0,0.0,0.0,0.0,0.110984,0.221967,0.135344,0.0,0.0937,0.135344,0.0,0.135344,0.0,0.135344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135344,0.0,0.110984,0.0,0.110984,0.135344,0.0,0.135344,0.135344,0.135344,0.135344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135344,0.2811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135344,0.110984,0.135344,0.0,0.0,0.135344,0.0,0.0,0.135344,0.110984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135344,0.0,0.0,0.110984,0.0,0.0,0.0,0.0,0.0,0.277361,0.0,0.135344,0.135344,0.0,0.240882,0.0,0.135344,0.0,0.0,0.0,0.0,0.135344,0.0,0.135344,0.0,0.135344,0.110984,0.0,0.0,0.110984,0.135344,0.0
4,0.233475,0.0,0.191453,0.0,0.233475,0.0,0.0,0.161638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191453,0.191453,0.0,0.0,0.0,0.0,0.233475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233475,0.0,0.233475,0.0,0.138511,0.191453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233475,0.233475,0.0,0.0,0.0,0.233475,0.233475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233475,0.233475,0.191453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233475,0.0,0.0,0.0,0.191453,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.143141,0.12085,0.0,0.174559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174559,0.143141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174559,0.174559,0.0,0.0,0.174559,0.143141,0.0,0.0,0.0,0.0,0.174559,0.174559,0.0,0.0,0.174559,0.174559,0.143141,0.0,0.0,0.0,0.0,0.0,0.174559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174559,0.0,0.0,0.0,0.0,0.0,0.103559,0.0,0.0,0.143141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174559,0.174559,0.0,0.174559,0.0,0.0,0.174559,0.0,0.174559,0.0,0.143141,0.174559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143141,0.0,0.0,0.0,0.174559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.357726,0.0,0.0,0.0,0.174559,0.207118,0.174559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174559,0.0,0.143141,0.0,0.0


(6, 148)

## With stopwords parameter

In [112]:
# Second vectorizer: identical call except for stop_words='english', so its
# vocabulary can be compared with the one obtained without the parameter.
vecto_check_sw = TfidfVectorizer(stop_words='english')

In [116]:
# Fit the stop-word-aware vectorizer on the same raw texts.
X_tfidf_check_sw = vecto_check_sw.fit_transform(texts_A)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vecto_check_sw, "get_feature_names_out"):
    feature_names_check_sw = vecto_check_sw.get_feature_names_out()
else:
    feature_names_check_sw = vecto_check_sw.get_feature_names()

# One row per text, one column per vocabulary term kept after stop-word removal.
X_tfidf_check_df_sw = pd.DataFrame(X_tfidf_check_sw.toarray(), columns=feature_names_check_sw)
display(X_tfidf_check_df_sw)
display(X_tfidf_check_df_sw.shape)

Unnamed: 0,10,11,aguero,ankle,announcement,anonymous,arms,article,association,athlon,away,ball,ballon,barcelona,big,bodies,break,bruyne,calf,called,campaign,canada,capital,chapter,city,claimed,come,days,discussion,dramatic,embark,emotional,expected,farewell,foden,football,french,game,games,gives,goal,gold,good,greatest,grosso,hands,hearts,injury,issue,julia,just,kevin,league,manchester,maneuver,medal,medical,netted,new,number,official,olympian,opposing,penalty,phil,players,premier,purdue,quotes,recovering,right,said,season,set,shootout,showed,sidelined,soccer,sports,stadium,start,sweden,talk,team,teams,tendon,tests,time,today,training,try,using,week,weeks,welcome,winner,winning,yokohama
0,0.0,0.0,0.0,0.0,0.0,0.222763,0.0,0.222763,0.0,0.222763,0.0,0.0,0.0,0.0,0.222763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222763,0.0,0.0,0.0,0.0,0.0,0.0,0.154222,0.0,0.0,0.0,0.222763,0.0,0.0,0.222763,0.222763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222763,0.0,0.0,0.0,0.0,0.0,0.445527,0.445527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222763,0.0,0.0,0.0,0.222763,0.0,0.182669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.236133,0.0,0.0,0.0,0.0,0.236133,0.0,0.236133,0.0,0.0,0.236133,0.0,0.0,0.0,0.236133,0.0,0.0,0.0,0.236133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.326955,0.0,0.236133,0.0,0.0,0.236133,0.0,0.0,0.0,0.0,0.236133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236133,0.0,0.0,0.193632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236133,0.0,0.0,0.0,0.0,0.0,0.193632,0.193632,0.0,0.0,0.0,0.0,0.0,0.236133,0.236133,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.246338,0.0,0.0,0.0,0.0,0.246338,0.0,0.0,0.0,0.246338,0.0,0.0,0.0,0.246338,0.0,0.0,0.0,0.0,0.0,0.170543,0.0,0.0,0.0,0.0,0.0,0.246338,0.0,0.0,0.246338,0.0,0.246338,0.0,0.0,0.246338,0.0,0.0,0.0,0.0,0.0,0.246338,0.0,0.246338,0.0,0.0,0.0,0.0,0.0,0.246338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.246338,0.0,0.0,0.0,0.0,0.246338,0.0,0.246338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.246338,0.246338
3,0.0,0.0,0.0,0.202601,0.0,0.0,0.0,0.0,0.0,0.0,0.202601,0.0,0.0,0.0,0.0,0.0,0.0,0.202601,0.0,0.0,0.202601,0.0,0.0,0.0,0.202601,0.0,0.0,0.166136,0.0,0.0,0.0,0.0,0.0,0.0,0.202601,0.0,0.0,0.0,0.202601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166136,0.202601,0.0,0.202601,0.202601,0.202601,0.202601,0.0,0.0,0.0,0.0,0.0,0.202601,0.0,0.0,0.0,0.0,0.202601,0.166136,0.202601,0.0,0.0,0.202601,0.0,0.0,0.202601,0.166136,0.0,0.0,0.0,0.0,0.0,0.0,0.202601,0.0,0.0,0.166136,0.0,0.0,0.0,0.0,0.0,0.202601,0.0,0.0,0.202601,0.0,0.202601,0.0,0.0,0.0
4,0.284615,0.0,0.284615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233388,0.0,0.0,0.0,0.0,0.284615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284615,0.284615,0.0,0.0,0.0,0.284615,0.284615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284615,0.284615,0.0,0.0,0.0,0.0,0.0,0.0,0.284615,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.242413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242413,0.198782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242413,0.242413,0.0,0.0,0.242413,0.198782,0.0,0.0,0.242413,0.242413,0.242413,0.242413,0.0,0.0,0.242413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242413,0.0,0.242413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242413,0.242413,0.0,0.0,0.0,0.0,0.0,0.0,0.242413,0.0,0.0


(6, 98)