In [1]:
# Libs initialization
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import re

# import scipy
# from scipy.signal import fftconvolve

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 1. Data file preparation
</br> Retrieving Data from a dataset collected using the Twitter API, renaming classes and randomizing.

In [2]:
df = pd.read_csv('Data.csv', sep= ';', index_col=0)
df = df.dropna()
df.shape
df.drop_duplicates(subset={'Text'}, inplace=True)
df.head(7)

Unnamed: 0_level_0,Text,Class,True class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,This is what it’s all about. The cut and thrus...,Sport,Sport
2,WHO WILL WIN? ITALY - 1.66 SWITZERLAND-6.0DRAW...,Sport,Sport
3,Laporta becomes a member of the RFEF Council.,Sport,Sport
4,HE'S DONE IT!! Eliud Kipchoge achieves 'the im...,Sport,Sport
5,I know this is not the first time I've said th...,Sport,Sport
6,Finish pencil work of Anthony Oluwafemi Olasen...,Sport,Sport
7,Greetings from the Sport Industry Awards!,Sport,Sport


In [3]:
# Converting Class names to codes/numbers
j = df['True class'].unique().tolist()
z = df['True class'].unique().tolist()
for i in range (0, len(j)):
    df.loc[df['True class'] == j[i], 'True class'] = i
    df.loc[df['Class'] == j[i], 'Class'] = i
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
# A List of English contractions from https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
c_dict = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"1st": "first",
"2nd": "second",
"3rd": "third",
"4th": "forth",
"5th": "fifth",
"6th": "sixth",
"7th": "seventh",
"8th": "eighth",
"9th": "ninth"
}

# 2. Text Filtering
</br> Four-step raw tex filter, using [nltk](https://www.nltk.org) libraries. Filter includes:
* Constructions filter (special thanks for [arturomp](https://stackoverflow.com/users/583834/arturomp) for [converting](https://stackoverflow.com/posts/19794953/revisions) wikipedia contraction-to-expansion page into a python dictionary)
* Stopwords filter
* Unwanted characters filter

In [5]:
def text_filter(text):
    
    # Convert words to lower case
    text = text.lower()

    # Remove constructions
    text = text.split()
    new_text = []
    for word in text:
        if word in c_dict:
            new_text.append(c_dict[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Remove SW
    text = text.split()
    sw = set(nltk.corpus.stopwords.words("english"))
    text = [w for w in text if not w in sw]
    text = " ".join(text)

    # Split numbers and words
    text = text.split()
    new_text2 = []
    for word in text:
        if (word.isalpha() or word.isdigit()):
            new_text2.append(word)
        else:
            for i in range(0,len(word)-1):
                if ((word[i].isdigit() and word[i+1].isalpha()) or (word[i+1].isdigit() and word[i].isalpha())):
                    word1 = word[0:(i+1)]
                    word2 = word[(i+1):len(word)]
                    new_text2.append(word1)
                    new_text2.append(word2)
    text = " ".join(new_text2)
        
    return text

def text_tokenizer(text):
    text =  nltk.WordPunctTokenizer().tokenize(text)
    return text

def joinclean(text):
    text = str(' '.join(text))
    return text

In [6]:
df['CleanedText'] = list(map(text_filter, df['Text']))
df['TokenizedText'] = list(map(text_tokenizer, df['CleanedText']))
lemm = nltk.stem.WordNetLemmatizer()
df['TokenizedText'] = list(map(lambda word:
                                     list(map(lemm.lemmatize, word)),
                                     df['TokenizedText']))
df['CleanedText'] = list(map(joinclean, df['TokenizedText']))
df.head(7)

Unnamed: 0,Text,Class,True class,CleanedText,TokenizedText
0,I've seen this video over and over again.,4,4,seen video,"[seen, video]"
1,"Throw backs,2001 really gave us hitd",4,4,throw back 2001 really gave u hitd,"[throw, back, 2001, really, gave, u, hitd]"
2,Salah or Auba: Who will win the AFCON Golden B...,0,0,salah auba win afcon golden boot,"[salah, auba, win, afcon, golden, boot]"
3,JUNGKOOK IS NOW ON SPOTIFY!!!! OHMYGOD,4,4,jungkook spotify ohmygod,"[jungkook, spotify, ohmygod]"
4,Strategic invitation to heads of five Central ...,1,1,strategic invitation head five central asian c...,"[strategic, invitation, head, five, central, a..."
5,GM. What is Music2Earn? MusicY is coming.,4,4,gm music 2earn music2 earn musicy coming,"[gm, music, 2earn, music2, earn, musicy, coming]"
6,Novak Djokovic is playing the Australian Open ...,0,0,novak djokovic playing australian open thanks ...,"[novak, djokovic, playing, australian, open, t..."


In [7]:
# Creating full corpus of words
corpus = []
for i in range(0, len(df)):
    corpus.append(df.loc[i, 'TokenizedText'])
    # w_list = df.loc[i, 'Tokenized Text']
    # for j in range (0, len(w_list)):
    #     corpus.append(w_list[j])
# corpus = set(corpus)
df['Words quantity'] = 0

for i in range(0, len(df)):

    word_list = df.TokenizedText[i]
    df.loc[i, ['Words quantity']] = len(word_list)
df.head(3)

Unnamed: 0,Text,Class,True class,CleanedText,TokenizedText,Words quantity
0,I've seen this video over and over again.,4,4,seen video,"[seen, video]",2
1,"Throw backs,2001 really gave us hitd",4,4,throw back 2001 really gave u hitd,"[throw, back, 2001, really, gave, u, hitd]",7
2,Salah or Auba: Who will win the AFCON Golden B...,0,0,salah auba win afcon golden boot,"[salah, auba, win, afcon, golden, boot]",6


# 3. Filtered Data output
</br> Data is output to files by categories for Logistic Regression and to one file for other types of algorithms.

In [8]:
# Creating the final dataset
df = df[['CleanedText', 'TokenizedText', 'Words quantity', 'Class', 'True class']]
df.to_csv('FilteredData.csv', sep=';', encoding='utf-8', index=False)
df.head(7)

Unnamed: 0,CleanedText,TokenizedText,Words quantity,Class,True class
0,seen video,"[seen, video]",2,4,4
1,throw back 2001 really gave u hitd,"[throw, back, 2001, really, gave, u, hitd]",7,4,4
2,salah auba win afcon golden boot,"[salah, auba, win, afcon, golden, boot]",6,0,0
3,jungkook spotify ohmygod,"[jungkook, spotify, ohmygod]",3,4,4
4,strategic invitation head five central asian c...,"[strategic, invitation, head, five, central, a...",12,1,1
5,gm music 2earn music2 earn musicy coming,"[gm, music, 2earn, music2, earn, musicy, coming]",7,4,4
6,novak djokovic playing australian open thanks ...,"[novak, djokovic, playing, australian, open, t...",16,0,0
