In [1]:
import collections
import math
import re

import nltk
import numpy as np
import pandas as pd
import sklearn
import sklearn.feature_extraction.text
import sklearn.model_selection

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zer0deck/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Read the semicolon-separated dataset (its first column becomes the index)
# and discard every row that has a missing value.
df = pd.read_csv('Data.csv', sep=';', index_col=0).dropna()
df.shape

(300, 3)

In [3]:
# Keep only the first row for each distinct 'Text' value.
# `subset` is given as a list: pandas documents it as a column label or a
# sequence of labels, and a set is not a sequence. The result is rebound
# instead of using inplace=True so the cell does not mutate the frame that
# the previous cell produced.
df = df.drop_duplicates(subset=['Text'])
df.head(7)

Unnamed: 0_level_0,Text,Class,True class
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,This is what it’s all about. The cut and thrus...,Sport,Sport
2.0,WHO WILL WIN? ITALY - 1.66 SWITZERLAND-6.0DRAW...,Sport,Sport
3.0,Laporta becomes a member of the RFEF Council.,Sport,Sport
4.0,HE'S DONE IT!! Eliud Kipchoge achieves 'the im...,Sport,Sport
5.0,I know this is not the first time I've said th...,Sport,Sport
6.0,Finish pencil work of Anthony Oluwafemi Olasen...,Sport,Sport
7.0,Greetings from the Sport Industry Awards!,Sport,Sport


In [4]:
# Encode class names as integers: the id of a class is its position in the
# order of first appearance within 'True class'. `z` keeps the names for the
# later cells; `j` is retained as an independent copy of the same list.
z = df['True class'].unique().tolist()
j = list(z)
for class_id, class_name in enumerate(z):
    df.loc[df['True class'] == class_name, 'True class'] = class_id
    # Values of 'Class' that never occur in 'True class' are left untouched.
    df.loc[df['Class'] == class_name, 'Class'] = class_id
# Shuffle the rows with a fixed seed so the row order (and everything derived
# from it) is reproducible; 42 matches the seed used for the train/test split.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [5]:
# A List of English contractions from https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# Keys are lower-case contractions (plus the ordinals 1st-9th); values are the
# expansions substituted for them during text cleaning.
c_dict = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"1st": "first",
"2nd": "second",
"3rd": "third",
# The ordinal of four is spelled "fourth" ("forth" means "onward").
"4th": "fourth",
"5th": "fifth",
"6th": "sixth",
"7th": "seventh",
"8th": "eighth",
"9th": "ninth"
}

In [6]:
def text_filter(text):
    """Clean one raw document and return its list of tokens.

    Steps, in order: lower-casing, removal of URLs and HTML remnants,
    expansion of contractions through ``c_dict``, punctuation stripping,
    English stop-word removal, separation of letter runs from digit runs,
    and tokenization.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens, each made only of letters or only of digits.
    """
    # Convert words to lower case. Typographic apostrophes are mapped to the
    # ASCII one so that contractions such as "it’s" match the keys of c_dict.
    text = text.lower().replace('\u2019', "'").replace('\u2018', "'")

    # Remove URLs and HTML remnants. This has to happen BEFORE punctuation is
    # stripped: '&amp;' and '<br />' contain '&', ';' and '/', so they can no
    # longer be recognised afterwards. The URL pattern stops at the first
    # whitespace, so the words that follow a link are kept.
    text = re.sub(r'https?://\S*', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'&amp;', ' ', text)

    # Expand contractions. Surrounding punctuation is ignored for the lookup
    # (it is stripped below anyway) so that e.g. "don't," is still expanded;
    # apostrophes are kept because keys such as "'cause" start with one.
    expanded = []
    for word in text.split():
        core = word.strip('.,!?;:"()[]')
        expanded.append(c_dict.get(core, word))
    text = " ".join(expanded)

    # Remove unwanted characters
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Remove stop words
    sw = set(nltk.corpus.stopwords.words("english"))
    words = [w for w in text.split() if w not in sw]

    # Split numbers and words: every maximal run of letters and every maximal
    # run of digits becomes its own piece ("covid19" -> "covid", "19"). Each
    # piece is emitted exactly once, and any other character is discarded
    # without discarding the letters or digits around it.
    pieces = []
    for word in words:
        pieces.extend(re.findall(r'[^\W\d_]+|\d+', word))
    text = " ".join(pieces)

    # Tokenization
    return nltk.WordPunctTokenizer().tokenize(text)

In [7]:
# Run the cleaning function over every raw text, in row order, and store the
# resulting token lists in a new column.
df['TokenizedText'] = [text_filter(raw_text) for raw_text in df['Text']]

In [8]:
def lemmatizing(text):
    """Lemmatize every token of every document.

    The function works on its argument only; it neither reads nor modifies
    the module-level DataFrame, so the caller stores the result.

    Parameters
    ----------
    text : iterable of list of str
        One token list per document.

    Returns
    -------
    list of list of str
        The lemmatized token lists, in the same order as ``text``.
    """
    lemm = nltk.stem.WordNetLemmatizer()
    return [[lemm.lemmatize(token) for token in tokens] for tokens in text]


df['TokenizedText'] = lemmatizing(df['TokenizedText'])
df.head(7)

Unnamed: 0,Text,Class,True class,TokenizedText
0,Just making beats,4,4,"[making, beat]"
1,Trying to figure out how to play that dang thing,4,4,"[trying, figure, play, dang, thing]"
2,Summer break in NZ means kids home from school...,2,2,"[summer, break, nz, mean, kid, home, school, s..."
3,I want prints like stripes and stars and polka...,2,2,"[want, print, like, stripe, star, polka, dot, ..."
4,When did you 'discover' Eden Hazard?,0,0,"[discover, eden, hazard]"
5,We enjoyed listening to an orchestra in Music ...,4,4,"[enjoyed, listening, orchestra, music, today, ..."
6,HE'S DONE IT!! Eliud Kipchoge achieves 'the im...,0,0,"[done, eliud, kipchoge, achieves, impossible, ..."


# Creating one frame per class for logistic regression (one-vs-rest labels)
for i in range(0, len(z)):
    df['Label'] = 0
    df['True Label'] = 0
    # Compare against the current class id `i`; comparing against the literal 0
    # would give every class the labels of class 0.
    df.loc[df['True class'] == i, ['True Label']] = 1
    df.loc[df['Class'] == i, ['Label']] = 1
    # Publishes the frame as a variable named after the class (z[i]).
    locals()[z[i]] = df[['TokenizedText', 'Label', 'True Label']]
    # locals()[z[i]].to_csv(z[i]+'.csv', sep=';', encoding='utf-8', index=False)
# Creating the final dataset. It carries the labels of the last class in `z`.
# It is stored under its own name: rebinding `df` here would drop the 'Class'
# and 'True class' columns that later cells still read.
model_df = df[['TokenizedText', 'Label', 'True Label']]
# model_df.to_csv('FilteredData.csv', sep=';', encoding='utf-8', index=False)
training_data, test_data = sklearn.model_selection.train_test_split(model_df, train_size=0.7, random_state=42)
print(training_data.shape)
print(test_data.shape)
# The documents are already token lists, so the tokenizer is the identity.
# ngram_range must be a tuple (min_n, max_n); (3, 3) keeps trigrams only.
bow_transform = sklearn.feature_extraction.text.CountVectorizer(tokenizer=lambda doc: doc, ngram_range=(3, 3), lowercase=False)
X_tr_bow = bow_transform.fit_transform(training_data['TokenizedText'])
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(len(bow_transform.vocabulary_))
print(X_tr_bow.shape)
X_te_bow = bow_transform.transform(test_data['TokenizedText'])
y_tr = training_data['Label']
y_te = test_data['Label']
# norm=None leaves the TF-IDF rows unnormalised.
tfidf_transform = sklearn.feature_extraction.text.TfidfTransformer(norm=None)
X_tr_tfidf = tfidf_transform.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_transform.transform(X_te_bow)
print(X_te_tfidf)

In [9]:
# Full corpus: one token list per document, collected by row label 0..len(df)-1.
corpus = [df.loc[row_label, 'TokenizedText'] for row_label in range(len(df))]

In [10]:
def tf(text):
    """Return the relative frequency of each distinct token of ``text``.

    ``text`` is a sequence of tokens. The result maps every distinct token to
    its number of occurrences divided by the total number of tokens; an empty
    sequence yields an empty dict.
    """
    total_tokens = float(len(text))
    occurrences = collections.Counter(text)
    return {token: count / total_tokens for token, count in occurrences.items()}
def idf(word, corpus):
    """Return log10(number of documents / number of documents containing ``word``).

    ``corpus`` is a collection of documents and membership is tested with
    ``word in document``. A word that occurs in no document leads to a
    ZeroDivisionError.
    """
    docs_with_word = 0.0
    for document in corpus:
        if word in document:
            docs_with_word += 1.0
    return math.log10(len(corpus) / docs_with_word)

In [11]:
# Per-document TF-IDF features, collected in plain Python lists and written to
# the frame once at the end:
#   'TF-IDF sum'      - sum of the TF-IDF weights of the distinct tokens
#   'TF-IDF'          - the weight of every token, in token order
#   'Words quantity'  - number of tokens in the document
# No variable is called `list`: shadowing the builtin would break every later
# (re-)run of the cells that call list(...).
idf_cache = {}  # idf depends only on the token and the corpus, so compute it once
tfidf_sums = []
tfidf_vectors = []
word_counts = []
for word_list in df['TokenizedText']:
    term_freq = tf(word_list)
    # Weight each DISTINCT token exactly once; multiplying inside a loop over
    # token positions would apply idf repeatedly to tokens that occur twice.
    weights = {}
    for term, freq in term_freq.items():
        if term not in idf_cache:
            idf_cache[term] = idf(term, corpus)
        weights[term] = freq * idf_cache[term]
    tfidf_sums.append(sum(weights.values()))
    tfidf_vectors.append([weights[token] for token in word_list])
    word_counts.append(len(word_list))
df['TF-IDF sum'] = tfidf_sums
df['TF-IDF'] = tfidf_vectors
df['Words quantity'] = word_counts
df.head(7)

Unnamed: 0,Text,Class,True class,TokenizedText,TF-IDF sum,TF-IDF,Words quantity
0,Just making beats,4,4,"[making, beat]",1.934626,"[0.9360781363741464, 0.9985475046782963]",2
1,Trying to figure out how to play that dang thing,4,4,"[trying, figure, play, dang, thing]",2.137968,"[0.4346372536824548, 0.4948432528152511, 0.374...",5
2,Summer break in NZ means kids home from school...,2,2,"[summer, break, nz, mean, kid, home, school, s...",2.267846,"[0.274912918230695, 0.2414651409346971, 0.2749...",9
3,I want prints like stripes and stars and polka...,2,2,"[want, print, like, stripe, star, polka, dot, ...",2.214833,"[0.14133875114105096, 0.1664245841130494, 0.11...",12
4,When did you 'discover' Eden Hazard?,0,0,"[discover, eden, hazard]",2.373873,"[0.7243954228040913, 0.824738754692085, 0.8247...",3
5,We enjoyed listening to an orchestra in Music ...,4,4,"[enjoyed, listening, orchestra, music, today, ...",1.958537,"[0.16138602361274879, 0.22492875127965958, 0.2...",11
6,HE'S DONE IT!! Eliud Kipchoge achieves 'the im...,0,0,"[done, eliud, kipchoge, achieves, impossible, ...",2.159905,"[0.13313966729043952, 0.16494775093841701, 0.1...",15


In [12]:
# Token count of the longest document.
max_tokens = df['Words quantity'].max()
print(max_tokens)

30


In [13]:
# Build a one-vs-rest frame for every class (for logistic regression).
# 'Label' flags rows whose 'Class' equals the class id, 'True Label' flags rows
# whose 'True class' equals it; both columns are rebuilt on every iteration.
for class_id, class_name in enumerate(z):
    df['Label'] = (df['Class'] == class_id).astype('int64')
    df['True Label'] = (df['True class'] == class_id).astype('int64')
    # Publishes the frame as a variable named after the class.
    locals()[class_name] = df[['TokenizedText', 'Label', 'True Label']]
    # Optional export of the per-class frame:
    # locals()[class_name].to_csv(class_name+'.csv', sep=';', encoding='utf-8', index=False)
# Final dataset: after the loop the label columns describe the last class in z.
df = df[['TokenizedText', 'Label', 'True Label']]
# Optional export of the final dataset:
# df.to_csv(FilteredData.csv, sep=';', encoding='utf-8', index=False)