In [0]:
import pandas as pd
import numpy as np
import nltk
import re
from tqdm import tqdm
from collections import Counter

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

import sklearn
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score




import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from scipy.sparse import coo_matrix

from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [0]:
!pip install transformers
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification, AdamW, BertConfig



In [0]:
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda:0')
    from torch.cuda import FloatTensor, LongTensor
else:
    device = torch.device('cpu')
    from torch import FloatTensor, LongTensor
    
try:
    from google.colab import drive
    is_in_colab = True
except:
    is_in_colab = False

In [0]:
try:
    from google.colab import drive
    is_in_colab = True
    nltk.download('stopwords')
    nltk.download('wordnet')
except:
    is_in_colab = False

if is_in_colab:
    drive.mount('/content/drive')
    data_folder = r'/content/drive/My Drive/Colab/Real-or-Not/data/'
else:
    data_folder = r'./data/'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
stop_words = set(stopwords.words('english'))
data = pd.read_csv(data_folder + '/train.csv')

In [0]:
data.info()

In [0]:
data.columns

# Preprocessing functions

In [0]:
def get_hashtag_column(dataframe):
    hashtags = []
    for text in dataframe.text:
        result = re.findall('#\w+', text)
        if result != []:
            result = [w[1:].lower() for w in result]
            hashtags.append(' '.join(result))
    return hashtags

In [0]:
def lemmatize_texts(texts):
    lemmatizer = WordNetLemmatizer()
    result = []
    for t in texts:
        lemmatized_words = []
        t = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                  'url', t)
        #t = re.sub('\!+', '', t)
        #t = re.sub('\?+', '', t)
        #t = re.sub('\d+[\:|\.]?\d*\s')
#         t = re.sub('\d+', '', t)
        tokens = re.findall('''\d+,?.?\d+|\w+'\w+|#?\w+-?\w+|\w+\*+\w+''', t)
        if tokens == []:
            print('No tokens for text:\n', t)
        
        #tokens = [w.lower() for w in tokens]
        for token in tokens:
            if token.lower() not in stop_words:
                lemmatized_words.append(lemmatizer.lemmatize(token).lower())
        result.append(' '.join(lemmatized_words).replace('#', ''))
    return result

In [0]:
def tokenizer(text):
    return text.split(' ')

In [0]:
tweets = lemmatize_texts(data.text)
all_lemmatized_tokens = [w for t in tweets for w in t.split(' ')]
print('Total words: ', len(all_lemmatized_tokens))
print('Unique_words: ', len(set(all_lemmatized_tokens)))

Total words:  73941
Unique_words:  16624


In [0]:
# Most common words in dataset
freq = nltk.probability.FreqDist(all_lemmatized_tokens)
# freq.most_common(20)

In [0]:
# Most common words in real tweets
real_tweets = data[data.target == 1].text
real_tweets = lemmatize_texts(real_tweets)
freq_real = nltk.probability.FreqDist([w for t in real_tweets for w in t.split(' ')])
# freq_real.most_common(10)

In [0]:
# Most common words in fake tweets
fake_tweets = data[data.target == 0].text
fake_tweets = lemmatize_texts(fake_tweets)
freq_fake = nltk.probability.FreqDist([w for t in fake_tweets for w in t.split(' ')])
# freq_fake.most_common(10)

In [0]:
X_train, X_val, y_train, y_val = train_test_split(tweets, data.target, train_size = 0.8, random_state=42)

In [0]:
# Vectorize texts
vectorizer = CountVectorizer(ngram_range=(1,2), tokenizer=tokenizer)
train = vectorizer.fit_transform(X_train)
val = vectorizer.transform(X_val)

# Classic models

## LinearSVC

In [0]:
svc = LinearSVC(random_state=42, C=1, penalty='l2', dual=False, max_iter=1000)
svc.fit(train, y_train)
svc.score(val, y_val)

0.7859487852921865

## RandomForestClassifier

In [0]:
forest = RandomForestClassifier(random_state=42, 
                                n_estimators=500, 
                                min_samples_leaf=1, 
                                max_depth=500,
                                oob_score=True)

forest.fit(train, y_train)
print(forest.oob_score_)

0.7865353037766831


In [0]:
forest.score(val, y_val)

0.7820091923834537

In [0]:
importance = sorted(zip(vectorizer.get_feature_names(), forest.feature_importances_), key=lambda x: x[1], reverse=True)
for imp in importance[:20]: print("Feature '{}', importance={}".format(*imp))

Feature 'url', importance=0.01778135697535768
Feature 'fire', importance=0.007216360137181047
Feature 'hiroshima', importance=0.006028989100168835
Feature 'california', importance=0.004819121060211772
Feature 'killed', importance=0.004386780481565708
Feature 'suicide', importance=0.0037768614321520007
Feature 'wildfire', importance=0.0035834106387096143
Feature 'storm', importance=0.0034789825363804186
Feature 'bombing', importance=0.003337123193101085
Feature 'earthquake', importance=0.0032461968407560557
Feature 'train', importance=0.0028747654752379406
Feature 'mh370', importance=0.002562752132686677
Feature 'police', importance=0.002561265136058398
Feature 'massacre', importance=0.0024027737726525483
Feature 'japan', importance=0.002287234156945054
Feature 'drought', importance=0.0022617893703801447
Feature 'accident', importance=0.0022141630229249223
Feature 'atomic', importance=0.0021913164242427976
Feature 'evacuated', importance=0.0021098341392674393
Feature 'car', importance=0

## AdaBoostClassifier

In [0]:
boost = AdaBoostClassifier(base_estimator=LogisticRegression(), random_state=42,
                           n_estimators=2000, learning_rate=1)
boost.fit(train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=LogisticRegression(C=1.0, class_weight=None,
                                                     dual=False,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     l1_ratio=None,
                                                     max_iter=100,
                                                     multi_class='auto',
                                                     n_jobs=None, penalty='l2',
                                                     random_state=None,
                                                     solver='lbfgs', tol=0.0001,
                                                     verbose=0,
                                                     warm_start=False),
                   learning_rate=1, n_estimators=2000, random_state=42)

In [0]:
boost.score(val, y_val)

0.7997373604727511

## BaggingClassifier

In [0]:
bagging = BaggingClassifier(base_estimator = LogisticRegression(), random_state=42,  
                            max_features=0.7, n_jobs=-1, 
                            max_samples=1.0, n_estimators=2000)
bagging.fit(train, y_train)

BaggingClassifier(base_estimator=LogisticRegression(C=1.0, class_weight=None,
                                                    dual=False,
                                                    fit_intercept=True,
                                                    intercept_scaling=1,
                                                    l1_ratio=None, max_iter=100,
                                                    multi_class='auto',
                                                    n_jobs=None, penalty='l2',
                                                    random_state=None,
                                                    solver='lbfgs', tol=0.0001,
                                                    verbose=0,
                                                    warm_start=False),
                  bootstrap=True, bootstrap_features=False, max_features=0.7,
                  max_samples=1.0, n_estimators=2000, n_jobs=-1,
                  oob_score=False, random_state=42, v

In [0]:
bagging.score(val, y_val)

0.8082731451083388

# Neural Network

In [0]:
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda:0')
    from torch.cuda import FloatTensor, LongTensor
else:
    device = torch.device('cpu')
    from torch import FloatTensor, LongTensor

In [0]:
def fit(model, loss_function, train_data=None, val_data=None, optimizer=None,
        epoch_count=1, batch_size=1, scheduler=None, alpha=1, bert=False):
    train_history = []
    val_history = []
    best_model = None
    for epoch in range(epoch_count):
            name_prefix = '[{} / {}] '.format(epoch + 1, epoch_count)
            epoch_train_score = 0
            epoch_val_score = 0
            
            if train_data:
                epoch_train_score = do_epoch(model, loss_function, train_data, batch_size, 
                                              optimizer, name_prefix + 'Train:', alpha=alpha, bert=bert,
                                             scheduler=scheduler)
                train_history.append(epoch_train_score)

            if val_data:
                name = '  Val:'
                if not train_data:
                    name = ' Test:'
                epoch_val_score = do_epoch(model, loss_function, val_data, batch_size, 
                                             optimizer=None, name=name_prefix + name, alpha=alpha, bert=bert,
                                           scheduler=scheduler)
                
                val_history.append(epoch_val_score)

    return train_history, val_history
    

In [0]:
def do_epoch(model, loss_function, data, batch_size, optimizer=None, name=None, alpha=1, bert=False, scheduler=None):
    """
       Генерация одной эпохи
    """
    accuracy = 0
    epoch_loss = 0
   
    batch_count = len(data)
   
    is_train = not optimizer is None
    name = name or ''
    model.train(is_train)
    
    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=batch_count) as progress_bar:               
            for ind, batch in enumerate(data):
                if bert:
                  X_batch, X_mask, y_batch =  batch[0].to(device), batch[1].to(device), batch[2].to(device)
                  loss, prediction = model(X_batch, token_type_ids=None, attention_mask=X_mask, labels=y_batch)
                else:
                  X_batch, y_batch = batch[0].to(device), batch[1].to(device)
                  prediction = model(X_batch)
                  loss = loss_function(prediction, y_batch)

                  for param in model.children():
                    if type(param) == nn.Linear:
                        loss += alpha * torch.abs(param.weight).sum()

                epoch_loss += loss.item()

                true_indices = torch.argmax(prediction, dim=1)
                correct_samples = torch.sum(true_indices == y_batch).cpu().numpy()
                accuracy += correct_samples / y_batch.shape[0]

                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    if scheduler: scheduler.step(accuracy)
              
                progress_bar.update()
                progress_bar.set_description('Epoch {} - accuracy: {:.2f}, loss {:.2f}'.format(
                    name, (accuracy / (ind+1)), epoch_loss / (ind+1))
                )
            
            accuracy /= (ind + 1)
            epoch_loss /= (ind + 1) 
            progress_bar.set_description(f'Epoch {name} - accuracy: {accuracy:.2f}, loss: {epoch_loss:.2f}')

    return accuracy

## BERT

In [0]:
# Preparing data
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [0]:
def prepare_data_for_bert(texts):
  MAX_LEN = 0
  input_ids = []
  attention_masks = []
  for tweet in texts:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_tweet = bert_tokenizer.encode(
                        tweet,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'

                        # This function also supports truncation and conversion
                        # to pytorch tensors, but we need to do padding, so we
                        # can't use these features :( .
                        #max_length = 128,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_tweet)

    if len(encoded_tweet) > MAX_LEN:
      MAX_LEN = len(encoded_tweet)

  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                        value=0, truncating="post", padding="post")
  
  # Make attention masks token -> 1, [PAD] -> 0
  for tweet in input_ids:
    att_mask = [int(token_id > 0) for token_id in tweet]
    attention_masks.append(att_mask)
    
  return input_ids, attention_masks


In [0]:
input_ids, attention_masks = prepare_data_for_bert(tweets)
labels = data.target.values

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.25)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                             random_state=2018, test_size=0.25)



In [0]:
batch_size = 16

# Create the DataLoader for our training set.
train_data = TensorDataset(torch.tensor(train_inputs), torch.tensor(train_masks), torch.tensor(train_labels))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(torch.tensor(validation_inputs), torch.tensor(validation_masks), torch.tensor(validation_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
# Load model

bert = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

bert.cuda()

In [0]:
import random

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# AdamW is a class from the huggingface library
optimizer = AdamW(bert.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)


In [0]:
train_history_bert, val_history_ bert = fit(model, loss_function=None, train_data=train_dataloader, val_data=validation_dataloader, optimizer=optimizer, epoch_count=3, batch_size=32, scheduler=scheduler, alpha=1, bert=True)

Epoch [1 / 3] Train: - accuracy: 0.78, loss: 0.48: 100%|██████████| 357/357 [01:30<00:00,  4.20it/s]
Epoch [1 / 3]   Val: - accuracy: 0.81, loss: 0.43: 100%|██████████| 119/119 [00:08<00:00, 14.56it/s]
Epoch [2 / 3] Train: - accuracy: 0.84, loss: 0.40: 100%|██████████| 357/357 [01:29<00:00,  4.18it/s]
Epoch [2 / 3]   Val: - accuracy: 0.82, loss: 0.44: 100%|██████████| 119/119 [00:08<00:00, 14.51it/s]
Epoch [3 / 3] Train: - accuracy: 0.86, loss: 0.35: 100%|██████████| 357/357 [01:29<00:00,  4.18it/s]
Epoch [3 / 3]   Val: - accuracy: 0.83, loss: 0.43: 100%|██████████| 119/119 [00:08<00:00, 14.48it/s]


([0.7777149321266968, 0.8355284421460892, 0.8612233354880414],
 [0.8067226890756303, 0.8151260504201681, 0.8261554621848739])

## LinearNN

In [0]:
class NNModel():
    def __init__(self, model):
        self.model = model
        
    def predict(self, inputs):
        self.model.eval()
        output = pd.DataFrame()
        for ind in range(inputs.shape[0]):
            X = FloatTensor(inputs[ind].toarray())
            predict = self.model(X)
            true_indices = torch.argmax(predict, dim=1).detach().cpu().numpy()
            output.loc[ind, 'target'] = true_indices
        return output.values

In [0]:
# fit settings
batch_size = 100
epoch_count = 10

# optim settings
learning_rate = 1e-4
weight_decay = 0.1
alpha = 0.005

# model settings
linear1_out = int(train.shape[1]**0.5)
output = 2
dropout = 0.3

# scheduler settings
factor = 0.5
patience = 3
threshold = 1e-2

model = nn.Sequential(nn.Linear(train.shape[1], linear1_out),
                      nn.BatchNorm1d(linear1_out),
#                       nn.Dropout(p=dropout, inplace=True),
                      nn.ReLU(inplace=True),
                      nn.Linear(linear1_out, output),
                      nn.ReLU(inplace=True)
                     ).to(device)

loss_function = nn.CrossEntropyLoss()

optimizer = optim.Adam(
                        model.parameters(),
                        lr=learning_rate, 
                        weight_decay=weight_decay
                    )

scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=factor, 
                              patience=patience, verbose=True, threshold=threshold
                              )


# Create the DataLoader for our training set.
train_data = TensorDataset(torch.FloatTensor(train.toarray()), torch.tensor(np.array(y_train)))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(torch.FloatTensor(val.toarray()), torch.tensor(np.array(y_val)))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
fit(model, loss_function, train_dataloader, validation_dataloader, optimizer, epoch_count, 100, scheduler=None, alpha=alpha)

Epoch [1 / 10] Train: - accuracy: 0.61, loss: 38.18: 100%|██████████| 61/61 [00:01<00:00, 47.94it/s]
Epoch [1 / 10]   Val: - accuracy: 0.67, loss: 3.85: 100%|██████████| 16/16 [00:00<00:00, 59.10it/s]
Epoch [2 / 10] Train: - accuracy: 0.71, loss: 2.48: 100%|██████████| 61/61 [00:01<00:00, 48.58it/s]
Epoch [2 / 10]   Val: - accuracy: 0.78, loss: 2.32: 100%|██████████| 16/16 [00:00<00:00, 57.20it/s]
Epoch [3 / 10] Train: - accuracy: 0.79, loss: 2.30: 100%|██████████| 61/61 [00:01<00:00, 48.15it/s]
Epoch [3 / 10]   Val: - accuracy: 0.79, loss: 2.44: 100%|██████████| 16/16 [00:00<00:00, 59.25it/s]
Epoch [4 / 10] Train: - accuracy: 0.81, loss: 2.31: 100%|██████████| 61/61 [00:01<00:00, 48.41it/s]
Epoch [4 / 10]   Val: - accuracy: 0.79, loss: 2.46: 100%|██████████| 16/16 [00:00<00:00, 59.95it/s]
Epoch [5 / 10] Train: - accuracy: 0.81, loss: 2.25: 100%|██████████| 61/61 [00:01<00:00, 48.24it/s]
Epoch [5 / 10]   Val: - accuracy: 0.79, loss: 2.32: 100%|██████████| 16/16 [00:00<00:00, 58.34it/s]

([0.6145173041894355,
  0.7060473588342443,
  0.7935519125683059,
  0.805865209471767,
  0.8138251366120216,
  0.8209653916211292,
  0.8279417122040071,
  0.8305828779599271,
  0.8367941712204005,
  0.8399453551912568],
 [0.6655706521739131,
  0.7833152173913044,
  0.7872554347826087,
  0.7914130434782609,
  0.7870380434782609,
  0.7937228260869564,
  0.7868206521739132,
  0.789320652173913,
  0.788913043478261,
  0.7886956521739129])

# Mixed models

## Create ensemble

In [0]:
models = []
models.append(bagging)
models.append(forest)
models.append(boost)
models.append(svc)
models.append(NNModel(model))

In [0]:
def ensemble(models, data):
    predicts = pd.DataFrame()
    for i, model in enumerate(models):
        predicts[i] = model.predict(data)
    result = predicts.apply(lambda row: row.value_counts().index[0], axis=1)
    return result.values

In [0]:
accuracy_score(ensemble(models, val_tf), y_val)

In [0]:
accuracy_score(ensemble(models, test_tf), y_test)

# Submission

In [0]:
# choose your best model
final_model = NNModel(model)

In [0]:
def submission(model, vectorizer, file_name="submission.csv"):
    test_data = pd.read_csv(data_folder + "/test.csv")
    all_lemmatized_texts = lemmatize_texts(test_data.text)
    test = vectorizer.transform(all_lemmatized_texts)
    submit = pd.DataFrame()
    submit['id'] = test_data['id']
    submit['target'] = final_model.predict(test)
    submit['target'] = submit['target'].astype('int')
    submit.to_csv(data_folder + file_name, index=False)
    
submission(final_model, vectorizer)