In [1]:
from collections import Counter
from functools import reduce
import os.path as osp
import ast
import math
from scipy import stats
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import normalize
from scipy.sparse import coo_matrix
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [3]:
import numpy as np
import pandas as pd
import nltk
import unicodedata
import re
import numpy as np
from numpy import dot
from numpy.linalg import norm
from gensim.models import Word2Vec
from nltk import word_tokenize
from scipy.sparse import hstack
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' and 'punkt' resources into the local nltk_data directory.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/alexch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/alexch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the three labelled splits and the submission template.
train_df = pd.read_csv("../input/train.csv")
valid_df = pd.read_csv("../input/valid.csv")
test_df = pd.read_csv("../input/test.csv")
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat(..., ignore_index=True) builds the same train+valid frame with a fresh 0..n-1 index.
trainval_df = pd.concat([train_df, valid_df], ignore_index=True)
sample_df = pd.read_csv('../input/submission_sample.csv')

In [5]:
# Longest `stance_label` list over train, valid and test (0 if all splits are empty).
# Each cell holds a Python-literal list stored as a string, so it is parsed exactly once here;
# iterating the columns directly avoids the removed DataFrame.append and the slow iterrows().
max_context = max(
    (
        len(ast.literal_eval(raw_labels))
        for raw_labels in chain(
            train_df['stance_label'], valid_df['stance_label'], test_df['stance_label']
        )
    ),
    default=0,
)

In [6]:
# Label <-> integer code tables. Each decoder is the exact inverse of its encoder.
stance_encoder = dict(OPPOSE=-1, NULL=0, SUPPORT=1)
stance_decoder = {code: label for label, code in stance_encoder.items()}
impact_encoder = dict(UNKNOWN=-1, NOT_IMPACTFUL=0, MEDIUM_IMPACT=1, IMPACTFUL=2)
impact_decoder = {code: label for label, code in impact_encoder.items()}

In [7]:
def feature_eng(text, stance):
    """Suffix every word of each sentence with that sentence's stance label.

    The label of the first sentence is always treated as 'SUPPORT', whatever
    `stance[0]` holds. Words are obtained by splitting on single spaces and the
    label is appended directly to the word, with no separator.

    Parameters
    ----------
    text : sequence of str
        Sentences to tag.
    stance : sequence of str
        One stance label per sentence. It is NOT modified (the previous
        implementation overwrote `stance[0]` in the caller's list).

    Returns
    -------
    list of str
        One tagged sentence per input sentence; `[]` for empty input.

    Raises
    ------
    IndexError
        If `text` has more sentences than there are labels.
    """
    # Work on a copy so the caller's list is left untouched.
    labels = ['SUPPORT'] + list(stance[1:])
    return [
        ' '.join(word + labels[i] for word in sentence.split(' '))
        for i, sentence in enumerate(text)
    ]

def seperate_data(df):
    """Turn a raw split into parallel arrays of ids, texts, stances and impact codes.

    For every row of `df`:

    * the text is the row's `text` followed by every entry of its `context`
      list, joined with single spaces;
    * the stance sequence is the row's `stance_label` list with its first
      entry forced to 'SUPPORT';
    * the "to-topic" stance sequence starts with the first two stance entries;
      every later entry repeats the previous to-topic stance when the stance is
      'SUPPORT', and otherwise flips its sign via `stance_encoder` /
      `stance_decoder`;
    * the impact label is mapped to its integer code via `impact_encoder`.

    `context` and `stance_label` are expected to be Python-literal lists stored
    as strings (they are parsed with `ast.literal_eval`).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns `id`, `text`, `context`, `stance_label`
        and `impact_label`.

    Returns
    -------
    tuple of numpy.ndarray
        `(ids, texts, to_topic_stances, impact_codes)`, one entry per row.
        `to_topic_stances` holds the space-joined to-topic stance sequence.
    """
    index = []
    text = []
    to_topic_stance_label = []
    impact_label = []

    columns = zip(df['id'], df['text'], df['context'], df['stance_label'], df['impact_label'])
    for row_id, claim, raw_context, raw_stance, impact in columns:
        example_text = [claim] + ast.literal_eval(raw_context)
        stance_str = ['SUPPORT'] + ast.literal_eval(raw_stance)[1:]

        # Slicing (rather than indexing stance_str[1]) keeps rows that carry a
        # single stance label from raising IndexError.
        to_topic_stance_str = stance_str[:2]
        for stance in stance_str[2:]:
            previous = to_topic_stance_str[-1]
            if stance == 'SUPPORT':
                to_topic_stance_str.append(previous)
            else:
                to_topic_stance_str.append(stance_decoder[-stance_encoder[previous]])

        index.append(row_id)
        text.append(" ".join(example_text))
        to_topic_stance_label.append(' '.join(to_topic_stance_str))
        impact_label.append(impact_encoder[impact])

    return np.array(index), np.array(text), np.array(to_topic_stance_label), np.array(impact_label)

def context_length_feat(df, max_context=24):
    """Build a fixed-width matrix of per-context word counts.

    Row `r` corresponds to the `r`-th row of `df` by position, independent of
    the DataFrame's index labels. Column `j` holds the number of
    space-separated tokens in the `j`-th entry of that row's `context` list.
    Rows with fewer than `max_context` entries are zero-padded on the right;
    entries beyond `max_context` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a `context` column whose cells are Python-literal lists
        of strings stored as text.
    max_context : int, default 24
        Number of columns of the returned matrix.

    Returns
    -------
    numpy.ndarray of shape (len(df), max_context)
        Float matrix of word counts.
    """
    length_feat = np.zeros((len(df), max_context))
    # enumerate() gives the positional row number; iterrows() would yield index
    # labels, which only coincide with positions for a default RangeIndex.
    for row_pos, raw_context in enumerate(df['context']):
        context = ast.literal_eval(raw_context)
        for j, text in enumerate(context[:max_context]):
            length_feat[row_pos, j] = len(text.split(' '))

    return length_feat
        

In [8]:
# Extract (ids, texts, to-topic stances, impact codes) for every split in one pass.
(
    (train_ids, train_texts, train_stance, train_labels),
    (valid_ids, valid_texts, valid_stance, valid_labels),
    (trainval_ids, trainval_texts, trainval_stance, trainval_labels),
    (test_ids, test_texts, test_stance, test_labels),
) = (seperate_data(frame) for frame in (train_df, valid_df, trainval_df, test_df))

In [9]:
# Per-context word-count matrices, all padded to the same `max_context` width.
train_text_length, valid_text_length, trainval_text_length, test_text_length = (
    context_length_feat(frame, max_context=max_context)
    for frame in (train_df, valid_df, trainval_df, test_df)
)


In [10]:
# Distinct impact labels present in the training split, in order of first appearance.
class_names = train_df['impact_label'].unique().tolist()

In [11]:
# Bag-of-n-grams over the texts (English stop words removed, rare n-grams pruned by min_df)
# and over the to-topic stance sequences.
text_count_vect = CountVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 4),
    min_df=0.006,
)
stance_count_vect = CountVectorizer(lowercase=True, ngram_range=(1, 8))

# Both vocabularies are learned from the training split only; valid/test are just transformed.
train_stance_vec = stance_count_vect.fit_transform(train_stance)
valid_stance_vec, test_stance_vec = (
    stance_count_vect.transform(stances) for stances in (valid_stance, test_stance)
)

# Final design matrices: text n-gram counts followed by stance n-gram counts.
X_train = hstack([text_count_vect.fit_transform(train_texts), train_stance_vec])
X_valid = hstack([text_count_vect.transform(valid_texts), valid_stance_vec])
X_test = hstack([text_count_vect.transform(test_texts), test_stance_vec])

In [12]:
# Shape of the stacked training matrix (text n-gram columns + stance n-gram columns).
X_train.shape

(5170, 3125)

In [13]:
# single model: validation macro-F1 of a random forest for a handful of seeds
best_f1 = 0
for seed in range(10000, 10005):
    model = RandomForestClassifier(n_estimators=80, random_state=20210402 + seed)
    model.fit(X_train, train_labels)
    valid_pred = model.predict(X_valid)
    f1_score = metrics.f1_score(valid_labels, valid_pred, average='macro')
    # Keep the best score seen so far (it was previously initialised but never updated).
    best_f1 = max(best_f1, f1_score)
    print("RF F-1 score:", f1_score, Counter(valid_pred))
    test_pred = model.predict(X_test)

RF F-1 score: 0.5949173273462418 Counter({2: 755, 1: 191, 0: 162})
RF F-1 score: 0.5928309967276545 Counter({2: 748, 1: 197, 0: 163})
RF F-1 score: 0.6140307516113676 Counter({2: 747, 1: 200, 0: 161})
RF F-1 score: 0.6029671976315153 Counter({2: 741, 1: 199, 0: 168})
RF F-1 score: 0.6168451717364761 Counter({2: 739, 1: 201, 0: 168})


In [14]:
# Re-fit both vectorizers on train+valid for the final model.
text_count_vect = CountVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 4),
    min_df=0.008,
)
stance_count_vect = CountVectorizer(lowercase=True, ngram_range=(1, 8))

trainval_stance_vec = stance_count_vect.fit_transform(trainval_stance)
test_stance_vec = stance_count_vect.transform(test_stance)

# Text n-gram counts first, stance n-gram counts appended as extra columns.
X_trainval = hstack([text_count_vect.fit_transform(trainval_texts), trainval_stance_vec])
X_test = hstack([text_count_vect.transform(test_texts), test_stance_vec])

In [15]:
# Final single model: fit on train+valid, predict the test split, report the class counts.
model = RandomForestClassifier(n_estimators=80, random_state=12345679).fit(
    X_trainval, trainval_labels
)
test_pred = model.predict(X_test)
sample_df['pred'] = test_pred
print(Counter(test_pred))

Counter({2: 749, 1: 195, 0: 164})


In [16]:
# Write the submission to 'rfc.csv', then read it back and count the predicted classes
# as a check on what was actually written.
sample_df.to_csv('rfc.csv', index=False)
submission_df = pd.read_csv('rfc.csv')
Counter(submission_df['pred'].values)

Counter({2: 749, 1: 195, 0: 164})

In [17]:
def ensemble_kfolds(kfolds_pred, verbose=True):
    """Majority-vote per-fold predictions, resolving weak 0-vs-2 splits to class 1.

    For each sample the most frequent prediction across folds is taken (ties
    resolve to the smallest class, as `scipy.stats.mode` does). When that
    winner received at most two votes and the folds predicted both class 0 and
    class 2 for the sample, the result is replaced by class 1.

    Parameters
    ----------
    kfolds_pred : array-like of shape (n_folds, n_samples)
        Integer class predictions, one row per fold. Not modified.
    verbose : bool, default True
        Print the fold votes of every sample whose prediction is overridden.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        The ensembled prediction for each sample.
    """
    kfolds_pred = np.asarray(kfolds_pred)
    # Use the argument (not a notebook global). reshape(-1) copes with both the
    # (1, n_samples) result of older SciPy and the (n_samples,) result of newer SciPy.
    majority = np.asarray(stats.mode(kfolds_pred, axis=0)[0]).reshape(-1)
    ensembled = majority.copy()
    for i, votes in enumerate(kfolds_pred.T):
        weak_majority = np.count_nonzero(votes == majority[i]) <= 2
        if weak_majority and 0 in votes and 2 in votes:
            if verbose:
                print(votes)
            ensembled[i] = 1

    return ensembled

In [18]:

# 5-fold stratified CV over train+valid. Every fold fits fresh vectorizers and a random
# forest on its training part, reports validation macro-F1, and predicts the test split.
# NOTE: the loop rebinds train_*/valid_*/X_* names that earlier cells also assign.
fkold_test_pred = []
skf = StratifiedKFold(n_splits=5, random_state=20210402, shuffle=True)
for i, (train_index, valid_index) in enumerate(skf.split(trainval_texts, trainval_labels)):
    # Slice out this fold's training and validation partitions.
    train_texts, valid_texts = trainval_texts[train_index], trainval_texts[valid_index]
    train_stance, valid_stance = trainval_stance[train_index], trainval_stance[valid_index]
    train_labels, valid_labels = trainval_labels[train_index], trainval_labels[valid_index]

    # Vocabularies are learned from the fold's training partition only.
    text_count_vect = CountVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1, 4),
        min_df=0.006,
    )
    stance_count_vect = CountVectorizer(lowercase=True, ngram_range=(1, 8))

    train_stance_vec = stance_count_vect.fit_transform(train_stance)
    valid_stance_vec = stance_count_vect.transform(valid_stance)
    test_stance_vec = stance_count_vect.transform(test_stance)

    # Text n-gram counts followed by stance n-gram counts.
    X_train = hstack([text_count_vect.fit_transform(train_texts), train_stance_vec])
    X_valid = hstack([text_count_vect.transform(valid_texts), valid_stance_vec])
    X_test = hstack([text_count_vect.transform(test_texts), test_stance_vec])

    # A different seed per fold.
    model = RandomForestClassifier(n_estimators=70, random_state=20210402 + i).fit(
        X_train, train_labels
    )
    valid_pred = model.predict(X_valid)
    print("fold", i)
    print(f"RF F-1 score: {metrics.f1_score(valid_labels, valid_pred, average='macro')}")

    test_pred = model.predict(X_test)
    fkold_test_pred.append(test_pred)

# One row of test predictions per fold.
fkold_test_pred = np.array(fkold_test_pred)

fold 0
RF F-1 score: 0.5974081944719485
fold 1
RF F-1 score: 0.606731746867934
fold 2
RF F-1 score: 0.5769031216150579
fold 3
RF F-1 score: 0.5900896544312236
fold 4
RF F-1 score: 0.6041251480821814


In [19]:
# Collapse the per-fold test predictions into one prediction per sample and show the class counts.
test_pred = ensemble_kfolds(fkold_test_pred)
print(Counter(test_pred))

[2 1 1 2 0]
[2 0 1 2 0]
[1 2 2 0 1]
[1 1 2 0 0]
[2 1 2 0 0]
[1 2 1 0 2]
Counter({2: 748, 1: 195, 0: 165})


In [20]:
# Store the ensembled predictions in the submission frame and write it out.
# This writes to 'rfc.csv' again, replacing the file saved by the single-model cell.
sample_df['pred'] = test_pred
sample_df.to_csv('rfc.csv', index=False)