In [35]:
import os
from xml.dom import minidom
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import  accuracy_score
import nltk
from nltk.tokenize import TweetTokenizer
import emoji
from sklearn.utils import shuffle
from string import punctuation

In [36]:
authors = [el.split('.')[0] for el in os.listdir('en')]
labels = np.loadtxt('en/truth.txt', delimiter=':::', dtype=np.str)

In [84]:
train, test = train_test_split(authors, test_size=0.2, random_state=8)

In [85]:
len(train)

336

In [86]:
train = shuffle(train)

In [87]:
def parse(xml):
    tweets = []
    doc = minidom.parse(xml)
    doclist = doc.getElementsByTagName('document')
    for i in range(len(doclist)):
        tweet = doclist[i].firstChild.nodeValue
        tweets.append(tweet.rstrip('\n'))

    return np.array(tweets)

def binarize_label(label):
    if label == 'NI': return 0
    elif label == 'I': return 1

def compose_dataset(labels, authors):
    dataset = {}
    dataset_for_tokenizing = {}
    labels_out = []
    for label in labels:
        id, cls = label[0], label[1]
        if id in authors:
            dataset[id] = ''
            dataset_for_tokenizing[id] = []
            tweets = parse(os.path.join('en/', id + '.xml'))
            tweets = [emoji.demojize(tweet.replace('/n','')) for tweet in tweets]
            for tweet in tweets:
                dataset[id] += tweet
                dataset_for_tokenizing[id].append(tweet)
            labels_out.append(binarize_label(cls))
    return dataset, dataset_for_tokenizing, labels_out

train_ds, train_tokens, labels_train = compose_dataset(labels, train)
#val_ds, val_tokens, labels_val = compose_dataset(labels, val)
test_ds, test_tokens, labels_test = compose_dataset(labels, test)


## Statistical features
- Punctuation marks
- Average tweet length
- Emoticons 
- Capitalisation

In [88]:
def tokenize_tweets(ds):
    tk = TweetTokenizer()
    dout = {}
    for author in ds.keys():
        dout[author] = []
        for tweet in ds[author]:
            dout[author].append(tk.tokenize(tweet))
    return dout

In [89]:
def produce_features(tokenized_ds):
    dout = {}
    for author in tokenized_ds.keys():
        length = []
        capitalisation_counts = []
        punct_counts = []
        emoji_counts = []
        hashtags = []
        links = []
        user_mentions = []
        for tweet in tokenized_ds[author]:
            emojis = len(emoji.emoji_lis(emoji.emojize(''.join(tweet))))
            emoji_counts.append(emojis)
            length.append(len(tweet))
            pc = 0
            cc = 0
            hs = 0
            urls = 0
            usrs = 0
            for el in tweet:
                if el in punctuation and el != '#':
                    pc += 1
                if el.isupper():
                    cc += 1
                if el == '#HASHTAG':
                    hs += 1
                if el == '#URL':
                    urls += 1
                if el == '#USER':
                    usrs += 1
            punct_counts.append(pc)
            capitalisation_counts.append(cc)
            hashtags.append(hs)
            links.append(urls)
            user_mentions.append(usrs)
        dout[author] = [np.mean(length),
                        np.mean(capitalisation_counts),
                        np.mean(punct_counts),
                        np.mean(emoji_counts),
                        np.mean(hashtags),
                        np.mean(links),
                        np.mean(user_mentions)]
    return dout
            

In [90]:
tokenized_train = tokenize_tweets(train_tokens)
#tokenized_val = tokenize_tweets(val_tokens)
tokenized_test = tokenize_tweets(test_tokens)

In [91]:
features_train = produce_features(tokenized_train)
#features_val = produce_features(tokenized_val)
features_test = produce_features(tokenized_test)

  if sys.path[0] == '':


In [92]:
Xstat = np.array(list(features_train.values()))
#Xvalstat = np.array(list(features_val.values()))
Xteststat = np.array(list(features_test.values()))

## N-grams

In [93]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv = CountVectorizer(analyzer = 'word', ngram_range=(1,3), stop_words='english')
cv_char = CountVectorizer(analyzer = 'char', ngram_range=(1,3), stop_words='english')

bow = cv.fit_transform(train_ds.values())
bow_char = cv_char.fit_transform(train_ds.values())
Xword = bow
Xchar = bow_char

# bow = cv.transform(val_ds.values())
# bow_char = cv_char.transform(val_ds.values())
# Xvalword = bow
# Xvalchar = bow_char

bow = cv.transform(test_ds.values())
bow_char = cv_char.transform(test_ds.values())
Xtestword = bow
Xtestchar = bow_char

## Classification

In [47]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression

In [48]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=50),
    RandomForestClassifier(max_depth=50, n_estimators=100, max_features=1),
    AdaBoostClassifier(),
    MultinomialNB(),
    LogisticRegression(),
    
]

In [49]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
    "Multinomial Naive Bayes",
    "Logistic Regression"
]

### Find the best classifier for the statistical feature set

In [16]:
for classifier, name in zip(classifiers, names):
    print('Classifier: ', name)
    classifier.fit(Xstat, labels_train)
    scores = cross_val_score(classifier, Xstat, labels_train, cv=5)
    print('CV score: ', scores.mean())

Classifier:  Nearest Neighbors
CV score:  0.7614035087719299
Classifier:  Linear SVM
CV score:  0.7349122807017543
Classifier:  RBF SVM
CV score:  0.7103157894736842
Classifier:  Decision Tree
CV score:  0.8171929824561402
Classifier:  Random Forest
CV score:  0.8781052631578946
Classifier:  AdaBoost
CV score:  0.8755438596491227
Classifier:  Multinomial Naive Bayes
CV score:  0.634280701754386
Classifier:  Logistic Regression
CV score:  0.7932280701754385


### Find the best classifier for the word level n-grams

In [17]:
for classifier, name in zip(classifiers, names):
    print('Classifier: ', name)
    classifier.fit(Xword, labels_train)
    y_pred = classifier.predict(Xtestword)
    scores = cross_val_score(classifier, Xword, labels_train, cv=5)
    print('Accuracy score: ', scores.mean())

Classifier:  Nearest Neighbors
Accuracy score:  0.8111929824561404
Classifier:  Linear SVM
Accuracy score:  0.8965263157894737
Classifier:  RBF SVM
Accuracy score:  0.5198947368421053
Classifier:  Decision Tree
Accuracy score:  0.8066315789473684
Classifier:  Random Forest
Accuracy score:  0.5516140350877192
Classifier:  AdaBoost
Accuracy score:  0.8993684210526316
Classifier:  Multinomial Naive Bayes
Accuracy score:  0.8697894736842106
Classifier:  Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Accuracy score:  0.893859649122807


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Find the best classifier for the character level n-grams

In [18]:
for classifier, name in zip(classifiers, names):
    print('Classifier: ', name)
    classifier.fit(Xchar, labels_train)
    y_pred = classifier.predict(Xtestchar)
    scores = cross_val_score(classifier, Xchar, labels_train, cv=5)
    print('Accuracy score: ', scores.mean())

Classifier:  Nearest Neighbors
Accuracy score:  0.8171578947368421
Classifier:  Linear SVM
Accuracy score:  0.8859298245614035
Classifier:  RBF SVM
Accuracy score:  0.5198947368421053
Classifier:  Decision Tree
Accuracy score:  0.8012280701754385
Classifier:  Random Forest
Accuracy score:  0.8196140350877194
Classifier:  AdaBoost
Accuracy score:  0.907298245614035
Classifier:  Multinomial Naive Bayes
Accuracy score:  0.8409473684210527
Classifier:  Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Accuracy score:  0.8673684210526316


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Grid search for hyperparameter tuning

In [28]:
from sklearn.model_selection import GridSearchCV
grid_svm = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}

grid_adaboost = {}
grid_adaboost['n_estimators'] = [10, 50, 100, 500]
grid_adaboost['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]

grid_rf = {'max_depth': range(10,150,10), 'n_estimators':range(10,150,10), 'max_features':range(1,5)}

#### Adaboost

In [25]:
ab = AdaBoostClassifier(learning_rate=0.1,n_estimators=500)
grid_search = GridSearchCV(estimator=ab, param_grid=grid_adaboost, n_jobs=-1, cv=5, scoring='accuracy')

# execute the grid search
grid_result = grid_search.fit(Xchar, labels_train)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.936526 using {'learning_rate': 0.1, 'n_estimators': 500}


#### Random Forest

In [27]:
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=grid_rf, n_jobs=-1, cv=5, scoring='accuracy')

# execute the grid search
grid_result = grid_search.fit(Xstat, labels_train)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.902070 using {'max_depth': 10, 'max_features': 3, 'n_estimators': 40}


#### SVM

In [29]:
svm = SVC()
grid_search = GridSearchCV(estimator=svm, param_grid=grid_svm, n_jobs=-1, cv=5, scoring='accuracy')

# execute the grid search
grid_result = grid_search.fit(Xword, labels_train)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.896526 using {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}


## Training

In [99]:
# Linear SVM
svm = SVC(kernel="linear", C=0.1, gamma=1)
svm.fit(Xword, labels_train)
y_pred_svm = svm.predict(Xtestword)

# AdaBoost
dt = AdaBoostClassifier(learning_rate=0.1,n_estimators=500)
dt.fit(Xchar, labels_train)
y_pred_dt = dt.predict(Xtestchar)

# Random Forest
rf = RandomForestClassifier(max_depth=10, n_estimators=40, max_features=3)
rf.fit(Xstat, labels_train)
y_pred_rf = rf.predict(Xteststat)

## Majority Voting

In [100]:
summed_preds = y_pred_svm + y_pred_dt + y_pred_rf

In [101]:
y_pred_total = [0 if el < 2 else 1 for el in summed_preds ]

#### Before the GridSearch
Accuracy score : 0.9302
#### After, with test set = 0.1
Accuracy score : 0.9767
#### After, with test set = 0.2
Accuracy score : 0.9647
#### After, with test set = 0.3
Accuracy score : 0.9528
#### After, with test set = 0.4
Accuracy score : 0.9408

In [102]:
score = accuracy_score(labels_test, y_pred_total)
print('Accuracy score : {}'.format(np.round(score,4)))

Accuracy score : 0.9765


In [103]:
print(accuracy_score(labels_test,y_pred_svm))
print(accuracy_score(labels_test,y_pred_dt))
print(accuracy_score(labels_test,y_pred_rf))

0.9176470588235294
0.9647058823529412
0.8705882352941177
