In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_csv('Data.csv', sep= ';', index_col=0)
df = df.dropna()
df.shape

(300, 3)

In [3]:
df.drop_duplicates(subset={'Text'}, inplace=True)
df.head(7)

Unnamed: 0_level_0,Text,Class,True class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,This is what it’s all about. The cut and thrus...,Sport,Sport
2.0,WHO WILL WIN? ITALY - 1.66 SWITZERLAND-6.0DRAW...,Sport,Sport
3.0,Laporta becomes a member of the RFEF Council.,Sport,Sport
4.0,HE'S DONE IT!! Eliud Kipchoge achieves 'the im...,Sport,Sport
5.0,I know this is not the first time I've said th...,Sport,Sport
6.0,Finish pencil work of Anthony Oluwafemi Olasen...,Sport,Sport
7.0,Greetings from the Sport Industry Awards!,Sport,Sport


In [4]:
j = df['True class'].unique().tolist()
for i in range (0, len(j)):
    df.loc[df['True class'] == j[i], 'True class'] = i
    df.loc[df['Class'] == j[i], 'Class'] = i
df = df.sample(frac=1).reset_index(drop=True)

In [5]:
# A list of contractions from https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [6]:
def text_filter(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(nltk.corpus.stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Tokenize each word
    text =  nltk.WordPunctTokenizer().tokenize(text)
        
    return text

In [7]:
df['Tokenized Text'] = list(map(text_filter, df['Text']))

In [8]:
def lemmatized_words(text):
    lemm = nltk.stem.WordNetLemmatizer()
    df['lemmatized_text'] = list(map(lambda word:
                                     list(map(lemm.lemmatize, word)),
                                     df['Tokenized Text']))
    

lemmatized_words(df['Tokenized Text'])
df.head(7)

Unnamed: 0,Text,Class,True class,Tokenized Text,lemmatized_text
0,Dolores O'Riordan (born 1971 - died 2018) was ...,4,4,"[dolores, riordan, born, 1971, died, 2018, one...","[dolores, riordan, born, 1971, died, 2018, one..."
1,As they say “Genius leaves Clues” and Ronaldo ...,0,0,"[say, “, genius, leaves, clues, ”, ronaldo, ce...","[say, “, genius, leaf, clue, ”, ronaldo, certa..."
2,What's your favorite accessory??,4,4,"[favorite, accessory]","[favorite, accessory]"
3,What have we just witnessed?!!!!!!Congrats ben...,0,0,"[witnessed, congrats, benstokes38, congrats, e...","[witnessed, congrats, benstokes38, congrats, e..."
4,"Kalpop is the joy, the passion, the mood and t...",4,4,"[kalpop, joy, passion, mood, whole, vibe, around]","[kalpop, joy, passion, mood, whole, vibe, around]"
5,I've seen this video over and over again.,4,4,"[seen, video]","[seen, video]"
6,"Jan. 7, 1995 Green Day were #1 on the Billboar...",4,4,"[jan, 7, 1995, green, day, 1, billboard, moder...","[jan, 7, 1995, green, day, 1, billboard, moder..."


In [9]:
df['Label'] = 0
df.loc[df['True class'] == 0, ['True Label']] = 1
df.loc[df['Class'] == 0, ['Label']] = 1
df.head(7)

Unnamed: 0,Text,Class,True class,Tokenized Text,lemmatized_text,Label,True Label
0,Dolores O'Riordan (born 1971 - died 2018) was ...,4,4,"[dolores, riordan, born, 1971, died, 2018, one...","[dolores, riordan, born, 1971, died, 2018, one...",0,
1,As they say “Genius leaves Clues” and Ronaldo ...,0,0,"[say, “, genius, leaves, clues, ”, ronaldo, ce...","[say, “, genius, leaf, clue, ”, ronaldo, certa...",1,1.0
2,What's your favorite accessory??,4,4,"[favorite, accessory]","[favorite, accessory]",0,
3,What have we just witnessed?!!!!!!Congrats ben...,0,0,"[witnessed, congrats, benstokes38, congrats, e...","[witnessed, congrats, benstokes38, congrats, e...",1,1.0
4,"Kalpop is the joy, the passion, the mood and t...",4,4,"[kalpop, joy, passion, mood, whole, vibe, around]","[kalpop, joy, passion, mood, whole, vibe, around]",0,
5,I've seen this video over and over again.,4,4,"[seen, video]","[seen, video]",0,
6,"Jan. 7, 1995 Green Day were #1 on the Billboar...",4,4,"[jan, 7, 1995, green, day, 1, billboard, moder...","[jan, 7, 1995, green, day, 1, billboard, moder...",0,
