In [1]:
from collections import Counter
from functools import reduce
import os.path as osp
import ast
import math
from scipy import stats
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import normalize
from scipy.sparse import coo_matrix
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [3]:
import numpy as np
import pandas as pd
import nltk
import unicodedata
import re
import numpy as np
from numpy import dot
from numpy.linalg import norm
from gensim.models import Word2Vec
from nltk import word_tokenize
from scipy.sparse import hstack
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' and 'punkt' data packages into the local nltk_data directory.
# The second call is the cell's last expression, so its return value is what the cell displays.
# NOTE(review): the cells visible here vectorise with scikit-learn's built-in "english"
# stop list and never call word_tokenize — confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/alexch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/alexch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the competition splits; paths are relative to the notebook's working directory.
train_df = pd.read_csv("../input/train.csv")
valid_df = pd.read_csv("../input/valid.csv")
test_df = pd.read_csv("../input/test.csv")
# Train and valid stacked into one frame with a fresh 0..n-1 index (used for K-fold CV).
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat with
# ignore_index=True is the equivalent, supported call.
trainval_df = pd.concat([train_df, valid_df], ignore_index=True)
# Submission template; the predictions are written into it at the end of the notebook.
sample_df = pd.read_csv('../input/submission_sample.csv')

In [5]:
# Stance label <-> integer code. OPPOSE and SUPPORT are negatives of each other, so
# multiplying a code by -1 flips the stance while NULL (0) stays NULL.
stance_encoder = {
    'OPPOSE': -1,
    'NULL': 0,
    'SUPPORT': 1,
}
# Inverse lookup derived from the encoder so the two tables cannot drift apart.
stance_decoder = {code: label for label, code in stance_encoder.items()}

# Impact label <-> integer class id; the non-negative ids are the classifier targets.
impact_encoder = {
    'UNKNOWN': -1,
    'NOT_IMPACTFUL': 0,
    'MEDIUM_IMPACT': 1,
    'IMPACTFUL': 2,
}
impact_decoder = {code: label for label, code in impact_encoder.items()}

In [6]:
def seperate_data(df):
    """Flatten a raw split into parallel arrays of ids, texts, stances and labels.

    Each row must provide:
      * ``id``           - example identifier, returned unchanged;
      * ``text``         - the example's own sentence;
      * ``context``      - string literal of a list of context sentences;
      * ``stance_label`` - string literal of a list of stance names;
      * ``impact_label`` - a key of ``impact_encoder``.

    For every row the text is joined with its context sentences using single
    spaces. The stance chain is rebuilt as ``['SUPPORT'] + stance_label[1:]``
    and then converted to a cumulative chain: the first two entries are copied
    as-is, and every later entry repeats the previous cumulative stance when
    it is ``'SUPPORT'`` and otherwise negates it (``'NULL'`` stays ``'NULL'``
    because its code is 0).

    Returns:
        Tuple ``(ids, texts, to_topic_stance, impact)`` of four NumPy arrays
        with one entry per row; ``to_topic_stance`` holds space-joined stance
        names and ``impact`` holds the integer codes from ``impact_encoder``.

    Raises:
        KeyError: on an impact or stance name missing from the encoders.
        ValueError / SyntaxError: if ``context`` or ``stance_label`` is not a
            valid Python literal.
    """
    index = []
    text = []
    to_topic_stance_label = []
    impact_label = []
    for _, row in df.iterrows():
        index.append(row['id'])
        text.append(" ".join([row['text']] + ast.literal_eval(row['context'])))

        # The first stance entry is always replaced by 'SUPPORT'.
        stance_str = ['SUPPORT'] + ast.literal_eval(row['stance_label'])[1:]
        # Slice rather than index [1]: a chain with a single entry (an example
        # without context) must not raise IndexError.
        to_topic_stance_str = stance_str[:2]
        for stance in stance_str[2:]:
            previous = to_topic_stance_str[-1]
            if stance == 'SUPPORT':
                to_topic_stance_str.append(previous)
            else:
                # Any non-SUPPORT link flips the sign of the running stance.
                to_topic_stance_str.append(
                    stance_decoder[stance_encoder[previous] * -1])

        to_topic_stance_label.append(' '.join(to_topic_stance_str))
        impact_label.append(impact_encoder[row['impact_label']])

    return np.array(index), np.array(text), np.array(to_topic_stance_label), np.array(impact_label)


In [7]:
# Turn every split into four parallel arrays:
# (ids, space-joined text, cumulative stance string, encoded impact label).
# NOTE: the train_* and valid_* text/stance/label arrays are rebound per fold by the
# K-fold loop further down; only the trainval_* and test_* arrays are read there.
train_ids, train_texts, train_stance, train_labels = seperate_data(train_df)
valid_ids, valid_texts, valid_stance, valid_labels = seperate_data(valid_df)
trainval_ids, trainval_texts, trainval_stance, trainval_labels = seperate_data(trainval_df)
# NOTE(review): seperate_data encodes row['impact_label'] for every row, so test.csv
# presumably carries that column (e.g. filled with 'UNKNOWN') — confirm.
test_ids, test_texts, test_stance, test_labels = seperate_data(test_df)

In [8]:
def ensemble_kfolds(kfolds_pred, weak_majority=2):
    """Merge per-fold class predictions into a single prediction per sample.

    The base prediction is the per-sample mode across folds (``scipy.stats.mode``
    picks the smallest value on ties). Where the mode is backed by at most
    ``weak_majority`` folds and the folds disagree across both extremes
    (classes 0 and 2 are both present), the sample is set to the middle
    class 1 and the conflicting votes are printed.

    Args:
        kfolds_pred: array-like of shape (n_folds, n_samples) with integer
            class predictions, one row per fold.
        weak_majority: largest number of agreeing folds that is still treated
            as "no clear winner". The default of 2 matches a 5-fold setup.

    Returns:
        1-D NumPy array of length n_samples with the ensembled class ids.
    """
    # Work on the argument itself (not a notebook-level global) so the function
    # gives the right answer for whatever prediction matrix it is handed.
    kfolds_pred = np.asarray(kfolds_pred)

    # Depending on the SciPy version, stats.mode returns the modes with shape
    # (1, n_samples) or (n_samples,); ravel() normalises both to 1-D.
    m = np.ravel(stats.mode(kfolds_pred, axis=0)[0])
    # Copy so that overriding a sample never aliases the reference modes.
    ensembled = m.copy()

    for i, ps in enumerate(kfolds_pred.T):
        if np.count_nonzero(ps == m[i]) <= weak_majority:
            if 0 in ps and 2 in ps:
                print(ps)
                ensembled[i] = 1

    return ensembled

In [9]:

# 5-fold stratified cross-validation over train+valid. Each fold fits its own
# vectorisers and random forest, reports macro F1 on the held-out part, and predicts
# the test set; the per-fold test predictions are ensembled in the next cell.
# NOTE: the loop rebinds train_texts/train_stance/train_labels and the matching valid_*
# names that were created earlier from train_df and valid_df.
fkold_test_pred = []
skf = StratifiedKFold(n_splits=5, random_state=20210402, shuffle=True)
for i, (train_index, valid_index) in enumerate(skf.split(trainval_texts, trainval_labels)):
    # Slice this fold's training part out of the combined train+valid arrays.
    train_texts = trainval_texts[train_index]
    train_stance = trainval_stance[train_index]
    train_labels = trainval_labels[train_index]

    # Held-out part of this fold.
    valid_texts = trainval_texts[valid_index]
    valid_stance = trainval_stance[valid_index]
    valid_labels = trainval_labels[valid_index]

    # Bag-of-n-grams over the text (1- to 4-grams, English stop words removed, rare
    # terms cut by min_df) and over the stance strings (1- to 8-grams of stance names).
    text_count_vect = CountVectorizer(lowercase=True, stop_words="english", ngram_range=(1, 4), min_df=0.006)
    stance_count_vect = CountVectorizer(lowercase=True, ngram_range=(1, 8))
    
    # Vocabularies are fitted on the training part only, then applied to valid and test.
    train_stance_vec = stance_count_vect.fit_transform(train_stance)
    valid_stance_vec = stance_count_vect.transform(valid_stance)
    test_stance_vec = stance_count_vect.transform(test_stance)
    
    X_train = text_count_vect.fit_transform(train_texts)
    X_valid = text_count_vect.transform(valid_texts)
    X_test = text_count_vect.transform(test_texts)

    # Concatenate text and stance features column-wise (sparse hstack).
    X_train = hstack((X_train, train_stance_vec))
    X_valid = hstack((X_valid, valid_stance_vec))
    X_test = hstack((X_test, test_stance_vec))
    
    # The seed is offset by the fold index, so every fold trains a differently seeded forest.
    model = RandomForestClassifier(n_estimators=70, random_state=20210402 + i)
    model.fit(X_train, train_labels)
    valid_pred = model.predict(X_valid)
    print("fold", i)
    print(f"RF F-1 score: {metrics.f1_score(valid_labels, valid_pred, average='macro')}")
    
    # Keep this fold's test predictions for the cross-fold ensemble.
    test_pred = model.predict(X_test)
    fkold_test_pred.append(test_pred)

# Stack the per-fold predictions into one array with a row per fold.
fkold_test_pred = np.array(fkold_test_pred)

fold 0
RF F-1 score: 0.5974081944719485
fold 1
RF F-1 score: 0.606731746867934
fold 2
RF F-1 score: 0.5769031216150579
fold 3
RF F-1 score: 0.5900896544312236
fold 4
RF F-1 score: 0.6041251480821814


In [10]:
# Collapse the per-fold test predictions into one class id per test example.
# ensemble_kfolds prints the vote vector of each sample it overrides to class 1.
test_pred = ensemble_kfolds(fkold_test_pred)
# Class distribution of the final test predictions.
print(Counter(test_pred))

[2 1 1 2 0]
[2 0 1 2 0]
[1 2 2 0 1]
[1 1 2 0 0]
[2 1 2 0 0]
[1 2 1 0 2]
Counter({2: 748, 1: 195, 0: 165})


In [11]:
# Put the ensembled predictions into the submission template's 'pred' column and
# save it without the DataFrame index.
# NOTE(review): predictions are written as integer class ids (0/1/2), not decoded
# through impact_decoder — confirm this is the expected submission format.
sample_df['pred'] = test_pred
sample_df.to_csv('rfc.csv', index=False)