In [1]:
import pandas as pd
import numpy as np
import nltk
# Fetch the NLTK resources used further down: the stop-word list read by
# text_filter and the WordNet data behind WordNetLemmatizer.
# NOTE(review): 'omw-1.4' is presumably a companion corpus WordNet needs in
# this NLTK version - confirm against the installed release.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Read the ';'-separated dataset (first column becomes the index) and discard
# every row that has a missing value, then show the resulting dimensions.
df = pd.read_csv('Data.csv', sep=';', index_col=0).dropna()
df.shape

(300, 3)

In [3]:
# Keep only the first occurrence of every distinct text.
# `subset` is passed as a list: the parameter is documented as a column label
# or a sequence of labels, and a set is not a sequence (it cannot be indexed).
# Rebinding instead of `inplace=True` keeps the cell chainable.
df = df.drop_duplicates(subset=['Text'])
df.head(7)

Unnamed: 0_level_0,Text,Class,True class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,This is what it’s all about. The cut and thrus...,Sport,Sport
2.0,WHO WILL WIN? ITALY - 1.66 SWITZERLAND-6.0DRAW...,Sport,Sport
3.0,Laporta becomes a member of the RFEF Council.,Sport,Sport
4.0,HE'S DONE IT!! Eliud Kipchoge achieves 'the im...,Sport,Sport
5.0,I know this is not the first time I've said th...,Sport,Sport
6.0,Finish pencil work of Anthony Oluwafemi Olasen...,Sport,Sport
7.0,Greetings from the Sport Industry Awards!,Sport,Sport


In [4]:
# Seed for the row shuffle below, so that a fresh "Restart & Run All"
# reproduces the same row order (and therefore the same displayed outputs).
SHUFFLE_SEED = 42

# Class names in order of first appearance; the position of a name in this
# list becomes its integer code. `z` is an independent copy of the same list
# (it is read again later in the notebook to name the per-class frames).
j = df['True class'].unique().tolist()
z = list(j)

# Replace each class name with its integer code in both label columns.
# Values of 'Class' that never occur in 'True class' are left untouched.
for i, class_name in enumerate(j):
    df.loc[df['True class'] == class_name, 'True class'] = i
    df.loc[df['Class'] == class_name, 'Class'] = i

# Shuffle the rows deterministically and renumber the index from 0.
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [5]:
# A List of English contractions from https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# Maps a lower-case contraction (written with a straight apostrophe) or a
# short ordinal such as "1st" to its expanded form. Looked up word by word
# in text_filter.
c_dict = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"1st": "first",
"2nd": "second",
"3rd": "third",
# "fourth" is the ordinal; the previous value "forth" is a different word.
"4th": "fourth",
"5th": "fifth",
"6th": "sixth",
"7th": "seventh",
"8th": "eighth",
"9th": "ninth"
}

In [6]:
# Punctuation that may cling to the edges of a word. Every character listed
# here is also part of the punctuation class that text_filter blanks out
# later, so discarding it while expanding a contraction loses nothing.
_EDGE_PUNCTUATION = '.,!?;:"()[]'


def _expand_contraction(word):
    """Return the c_dict expansion of ``word`` (edge punctuation ignored), else ``word``."""
    return c_dict.get(word.strip(_EDGE_PUNCTUATION), word)


def _split_letters_from_digits(word):
    """Cut ``word`` at every letter/digit boundary, e.g. 'a1b' -> ['a', '1', 'b'].

    Returns an empty list when the word contains no such boundary.
    """
    pieces = []
    start = 0
    for i in range(len(word) - 1):
        left, right = word[i], word[i + 1]
        if (left.isdigit() and right.isalpha()) or (left.isalpha() and right.isdigit()):
            pieces.append(word[start:i + 1])
            start = i + 1
    if start == 0:
        # No boundary was found anywhere in the word.
        return []
    pieces.append(word[start:])
    return pieces


def text_filter(text):
    """Normalise a raw text and return it as a list of tokens.

    Steps, in order: lower-casing, contraction expansion via ``c_dict``,
    removal of URLs / HTML remnants / punctuation, English stop-word removal,
    separation of letters from digits inside a word, and tokenization with
    ``nltk.WordPunctTokenizer``.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    list of str
        The tokens that survive the filtering.

    Notes
    -----
    A word that is neither purely alphabetic nor purely numeric and that has
    no letter/digit boundary (for example one made only of symbols) is
    discarded.
    """
    # Convert words to lower case
    text = text.lower()

    # Treat the typographic apostrophe like the straight one, so that words
    # such as "it’s" are found in c_dict instead of being discarded later on.
    text = text.replace('\u2019', "'")

    # Expand contractions
    text = " ".join(_expand_contraction(word) for word in text.split())

    # Remove unwanted characters.
    # Only the URL itself is removed: a greedy ".*" would also swallow all the
    # text that follows it, since the text is a single line at this point.
    text = re.sub(r'https?://\S+', '', text)
    # HTML remnants must go before the punctuation class below blanks out
    # '&', ';' and '/', otherwise these patterns can never match.
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Remove stop words
    sw = set(nltk.corpus.stopwords.words("english"))
    words = [w for w in text.split() if w not in sw]

    # Split numbers and words
    separated = []
    for word in words:
        if word.isalpha() or word.isdigit():
            separated.append(word)
        else:
            # Each piece is emitted exactly once, also for words that have
            # several letter/digit boundaries.
            separated.extend(_split_letters_from_digits(word))

    # Tokenization
    return nltk.WordPunctTokenizer().tokenize(" ".join(separated))

In [7]:
# Run every raw text through text_filter and store the resulting token lists
# in a new column (assigned positionally, one list per row).
df['Tokenized Text'] = [text_filter(raw_text) for raw_text in df['Text']]

In [8]:
def lemmatizing(text):
    """Lemmatize every token of every document in ``text``.

    Parameters
    ----------
    text : iterable of list of str
        One token list per document.

    Returns
    -------
    list of list of str
        The token lists with each token replaced by the result of
        ``WordNetLemmatizer.lemmatize``; same order and lengths as the input.

    Notes
    -----
    The function works on its argument and does not touch any global
    DataFrame; the caller decides where to store the result.
    """
    lemm = nltk.stem.WordNetLemmatizer()
    return [[lemm.lemmatize(token) for token in tokens] for tokens in text]


df['Tokenized Text'] = lemmatizing(df['Tokenized Text'])
df.head(7)

Unnamed: 0,Text,Class,True class,Tokenized Text
0,It's going to be a new year for Sports Unfolde...,0,0,"[going, new, year, sport, unfolded, begin, tra..."
1,"Find the brands you love, the latest trends, a...",2,2,"[find, brand, love, latest, trend, amazing, pa..."
2,"If your from Mayo, turn on Tg4 , best 10minute...",0,0,"[mayo, turn, tg, 4, best, 10, minute, 2021, ma..."
3,"Throw backs,2001 really gave us hitd",4,4,"[throw, back, 2001, really, gave, u, hitd]"
4,Match Poker brings nations and people together...,0,0,"[match, poker, brings, nation, people, togethe..."
5,When analyzing short sprints like the NFL Comb...,0,0,"[analyzing, short, sprint, like, nfl, combine,..."
6,Musical Ear Syndrome: How it happens and why,4,4,"[musical, ear, syndrome, happens]"


In [11]:
# One binary (one-vs-rest) frame per class, keyed by class name.
class_frames = {}
for i, class_name in enumerate(z):
    # Reset both indicator columns, then flag the rows of the current class.
    # After the loop `df` therefore carries the flags of the LAST class only.
    df['Label'] = 0
    df['True Label'] = 0
    df.loc[df['True class'] == i, ['True Label']] = 1
    df.loc[df['Class'] == i, ['Label']] = 1
    # Store an independent copy so the next iteration's reset cannot affect it.
    class_frames[class_name] = df[['Tokenized Text', 'Label', 'True Label']].copy()
    # Also expose the frame as a notebook-level variable named after the
    # class. globals() is written explicitly: the dictionary returned by
    # locals() is documented as something that should not be modified.
    globals()[class_name] = class_frames[class_name]
df.head(7)

Unnamed: 0,Text,Class,True class,Tokenized Text,Label,True Label
0,It's going to be a new year for Sports Unfolde...,0,0,"[going, new, year, sport, unfolded, begin, tra...",0,0
1,"Find the brands you love, the latest trends, a...",2,2,"[find, brand, love, latest, trend, amazing, pa...",0,0
2,"If your from Mayo, turn on Tg4 , best 10minute...",0,0,"[mayo, turn, tg, 4, best, 10, minute, 2021, ma...",0,0
3,"Throw backs,2001 really gave us hitd",4,4,"[throw, back, 2001, really, gave, u, hitd]",1,1
4,Match Poker brings nations and people together...,0,0,"[match, poker, brings, nation, people, togethe...",0,0
5,When analyzing short sprints like the NFL Comb...,0,0,"[analyzing, short, sprint, like, nfl, combine,...",0,0
6,Musical Ear Syndrome: How it happens and why,4,4,"[musical, ear, syndrome, happens]",1,1
