In [1]:
# Libs initialization
import pandas as pd
import numpy as np
import collections
import statistics
import math
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import re

# import scipy
# from scipy.signal import fftconvolve

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 1. Data file preparation
</br> Retrieving Data from a dataset collected using the Twitter API, renaming classes and randomizing.

In [2]:
df = pd.read_csv('Data.csv', sep= ';', index_col=0)
df = df.dropna()
df.shape
df.drop_duplicates(subset={'Text'}, inplace=True)
df.head(7)

Unnamed: 0_level_0,Text,Class,True class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,This is what it’s all about. The cut and thrus...,Sport,Sport
2,WHO WILL WIN? ITALY - 1.66 SWITZERLAND-6.0DRAW...,Sport,Sport
3,Laporta becomes a member of the RFEF Council.,Sport,Sport
4,HE'S DONE IT!! Eliud Kipchoge achieves 'the im...,Sport,Sport
5,I know this is not the first time I've said th...,Sport,Sport
6,Finish pencil work of Anthony Oluwafemi Olasen...,Sport,Sport
7,Greetings from the Sport Industry Awards!,Sport,Sport


In [3]:
# Converting Class names to codes/numbers
j = df['True class'].unique().tolist()
z = df['True class'].unique().tolist()
for i in range (0, len(j)):
    df.loc[df['True class'] == j[i], 'True class'] = i
    df.loc[df['Class'] == j[i], 'Class'] = i
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
# A List of English contractions from https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
c_dict = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"1st": "first",
"2nd": "second",
"3rd": "third",
"4th": "forth",
"5th": "fifth",
"6th": "sixth",
"7th": "seventh",
"8th": "eighth",
"9th": "ninth"
}

# 2. Text Filtering
</br> Four-step raw tex filter, using [nltk](https://www.nltk.org) libraries. Filter includes:
* Constructions filter (special thanks for [arturomp](https://stackoverflow.com/users/583834/arturomp) for [converting](https://stackoverflow.com/posts/19794953/revisions) wikipedia contraction-to-expansion page into a python dictionary)
* Stopwords filter
* Unwanted characters filter

In [5]:
def text_filter(text):
    
    # Convert words to lower case
    text = text.lower()

    # Remove constructions
    text = text.split()
    new_text = []
    for word in text:
        if word in c_dict:
            new_text.append(c_dict[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # Remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Remove SW
    text = text.split()
    sw = set(nltk.corpus.stopwords.words("english"))
    text = [w for w in text if not w in sw]
    text = " ".join(text)

    # Split numbers and words
    text = text.split()
    new_text2 = []
    for word in text:
        if (word.isalpha() or word.isdigit()):
            new_text2.append(word)
        else:
            for i in range(0,len(word)-1):
                if ((word[i].isdigit() and word[i+1].isalpha()) or (word[i+1].isdigit() and word[i].isalpha())):
                    word1 = word[0:(i+1)]
                    word2 = word[(i+1):len(word)]
                    new_text2.append(word1)
                    new_text2.append(word2)
    text = " ".join(new_text2)

    # Tokenization
    text =  nltk.WordPunctTokenizer().tokenize(text)
        
    return text

def lemmatizing(text):
    lemm = nltk.stem.WordNetLemmatizer()
    df['TokenizedText'] = list(map(lambda word:
                                     list(map(lemm.lemmatize, word)),
                                     df['TokenizedText']))

In [6]:
df['TokenizedText'] = list(map(text_filter, df['Text']))
lemmatizing(df['TokenizedText'])
df.head(7)

Unnamed: 0,Text,Class,True class,TokenizedText
0,"Only when your activity is exuberant, will you...",0,0,"[activity, exuberant, keep, crossing, limit, s..."
1,Severn Class are having a great time in PE thi...,0,0,"[severn, class, great, time, pe, afternoon]"
2,Are the existing government structures and pol...,1,1,"[existing, government, structure, policy, adeq..."
3,"Seriously, if I provide my own hair, how much ...",2,2,"[seriously, provide, hair, much, would, cost, ..."
4,SPORTSART EXPANDS ITS ECO-POWR(TM) LINE WITH T...,1,0,"[sportsart, expands, eco, powr, tm, line, unve..."
5,Did you know that we are always looking for ne...,1,1,"[know, always, looking, new, company, onboard,..."
6,"Wimbledon, THIS ... and STILL an NBA Finals ga...",0,0,"[wimbledon, still, nba, final, game, tonight]"


In [7]:
# Creating full corpus of words
corpus = []
for i in range(0, len(df)):
    corpus.append(df.loc[i, 'TokenizedText'])
    # w_list = df.loc[i, 'Tokenized Text']
    # for j in range (0, len(w_list)):
    #     corpus.append(w_list[j])
# corpus = set(corpus)

# 3. TF-IDF
</br> This part count TF-IDF vectors for dataset based on the tokenized words

In [8]:
def tf(text):
    d = {}
    tf_text = collections.Counter(text)
    for i in tf_text:
        d[i] = tf_text[i]/float(len(text))
    return d
def idf(word, corpus):
        return math.log10(len(corpus)/sum([1.0 for i in corpus if word in i]))

In [9]:
df['Words quantity'] = 0
df['TF-IDF mean'] = 0
df['TF-IDF sum'] = 0
df['TF-IDF'] = 0
list =[]

for i in range(0, len(df)):

    word_list = df.TokenizedText[i]

    wv = []
    t_f = tf(word_list)
    for j in range(0, len(word_list)):
        id_f = idf(word_list[j], corpus)
        t_f[word_list[j]] = t_f[word_list[j]] * id_f 
        wv.append(t_f[word_list[j]])

    # Addind the words quantity and the TF-IDF sum in the columns
    df.loc[i, ['Words quantity']] = len(word_list)
    df.loc[i, ['TF-IDF sum']] = sum(t_f.values())

    t_f.clear()
    # Deleting less importaint words if there are more than 5
    if len(word_list)>5:
        while len(wv)>5:
            word_list.pop(wv.index(min(wv)))
            wv.remove(min(wv))
        wv = []
        t_f = tf(word_list)
        for j in range(0, len(word_list)):
            id_f = idf(word_list[j], corpus)
            t_f[word_list[j]] = t_f[word_list[j]] * id_f 
            wv.append(t_f[word_list[j]])
        df.TokenizedText[i] = word_list
    
    # Addind the TF-IDF average in the columns
    df.loc[i, ['TF-IDF mean']] = statistics.mean(wv)

    # We create linear convolution of our TF-IDF parameter to fit it into list of 5 objects
    # That solves problems of many-variable TF-IDF vectors
    # y = np.array ([1, 1, 1, 1, 1])
    # npwv = fftconvolve(y, wv, mode='same', axes=None)
    # list.append(npwv)

    wvnp = np.zeros(5)
    for i in range(0, len(wv)):
        wvnp[i]=wv[i]
    list.append(wvnp)
df['TF-IDF'] = list
df.head(7)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.TokenizedText[i] = word_list


Unnamed: 0,Text,Class,True class,TokenizedText,Words quantity,TF-IDF mean,TF-IDF sum,TF-IDF
0,"Only when your activity is exuberant, will you...",0,0,"[limit, exuberant, activity, crossing, limit]",12,1.106642,5.269886,"[1.0771387677108921, 0.5385693838554461, 0.478..."
1,Severn Class are having a great time in PE thi...,0,0,"[severn, class, great, pe, afternoon]",6,0.443361,2.079307,"[0.5385693838554461, 0.3987753829882423, 0.322..."
2,Are the existing government structures and pol...,1,1,"[bomba, shed, kenya, security, expect]",20,0.538569,2.43097,"[0.5385693838554461, 0.5385693838554461, 0.538..."
3,"Seriously, if I provide my own hair, how much ...",2,2,"[hair, hair, 3, 3, make]",23,1.052554,2.497863,"[0.9567267694452997, 2.2883152784330987, 0.660..."
4,SPORTSART EXPANDS ITS ECO-POWR(TM) LINE WITH T...,1,0,"[tm, unveiling, 260, rower, consumer]",14,0.538569,2.442791,"[0.5385693838554461, 0.5385693838554461, 0.538..."
5,Did you know that we are always looking for ne...,1,1,"[company, onboard, industry, park, austria]",9,0.483361,2.037962,"[0.41815738558985355, 0.5385693838554461, 0.47..."
6,"Wimbledon, THIS ... and STILL an NBA Finals ga...",0,0,"[wimbledon, still, nba, final, tonight]",6,0.467443,2.131311,"[0.5385693838554461, 0.44314513291151353, 0.53..."


# 4. Filtered Data output
</br> Data is output to files by categories for Logistic Regression and to one file for other types of algorithms.

In [10]:
# Creating the final dataset
df = df[['TokenizedText', 'TF-IDF', 'TF-IDF sum', 'Words quantity', 'TF-IDF mean', 'Class', 'True class']]
df.to_csv('FilteredData.csv', sep=';', encoding='utf-8', index=False)
df.head(7)

Unnamed: 0,TokenizedText,TF-IDF,TF-IDF sum,Words quantity,TF-IDF mean,Class,True class
0,"[limit, exuberant, activity, crossing, limit]","[1.0771387677108921, 0.5385693838554461, 0.478...",5.269886,12,1.106642,0,0
1,"[severn, class, great, pe, afternoon]","[0.5385693838554461, 0.3987753829882423, 0.322...",2.079307,6,0.443361,0,0
2,"[bomba, shed, kenya, security, expect]","[0.5385693838554461, 0.5385693838554461, 0.538...",2.43097,20,0.538569,1,1
3,"[hair, hair, 3, 3, make]","[0.9567267694452997, 2.2883152784330987, 0.660...",2.497863,23,1.052554,2,2
4,"[tm, unveiling, 260, rower, consumer]","[0.5385693838554461, 0.5385693838554461, 0.538...",2.442791,14,0.538569,1,0
5,"[company, onboard, industry, park, austria]","[0.41815738558985355, 0.5385693838554461, 0.47...",2.037962,9,0.483361,1,1
6,"[wimbledon, still, nba, final, tonight]","[0.5385693838554461, 0.44314513291151353, 0.53...",2.131311,6,0.467443,0,0
