In [1]:
from collections import Counter
from functools import reduce
import os.path as osp
import ast
import math
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import coo_matrix
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb

In [2]:
import numpy as np
import pandas as pd
import nltk
import unicodedata
import re
import numpy as np
from numpy import dot
from numpy.linalg import norm
from gensim.models import Word2Vec
from nltk import word_tokenize
from scipy.sparse import hstack
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' and 'punkt' data packages into the local
# nltk_data directory. Packages that are already present are reported as
# up-to-date (see the log output below); the final call's return value is
# what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/alexch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/alexch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the provided splits plus the sample submission file.
train_df = pd.read_csv("../input/train.csv")
valid_df = pd.read_csv("../input/valid.csv")
test_df = pd.read_csv("../input/test.csv")
# Combined train+valid pool used later for cross-validation.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat(..., ignore_index=True) yields the same frame (train rows first,
# then valid rows, with a fresh 0..n-1 index) on every pandas version.
trainval_df = pd.concat([train_df, valid_df], ignore_index=True)
sample_df = pd.read_csv('../input/submission_sample.csv')

In [4]:
# Label vocabularies: each encoder maps a label string to its integer code,
# and each decoder is the exact inverse mapping (code -> label string).
stance_encoder = dict(OPPOSE=-1, NULL=0, SUPPORT=1)
stance_decoder = {code: label for label, code in stance_encoder.items()}
impact_encoder = dict(UNKNOWN=-1, NOT_IMPACTFUL=0, MEDIUM_IMPACT=1, IMPACTFUL=2)
impact_decoder = {code: label for label, code in impact_encoder.items()}

In [5]:
def seperate_data(df):
    """Split a raw data frame into four parallel NumPy arrays.

    For every row of ``df``:
      * ``id`` is copied as-is;
      * ``text`` is concatenated (space-separated) with the entries of
        ``context``, which is stored as the string form of a Python list and
        parsed with ``ast.literal_eval``;
      * ``stance_label`` (also a stringified list) is parsed, its first
        element is dropped, and the remainder is joined with spaces;
      * ``impact_label`` is mapped to its integer code via the module-level
        ``impact_encoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id``, ``text``, ``context``,
        ``stance_label`` and ``impact_label``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ids, texts, stance_strings, impact_codes)``, one entry per row.

    Raises
    ------
    KeyError
        If an ``impact_label`` value is not a key of ``impact_encoder``.
    """
    ids, texts, stances, impacts = [], [], [], []
    for _, record in df.iterrows():
        ids.append(record['id'])
        # Claim text first, followed by its context sentences.
        sentences = [record['text']] + ast.literal_eval(record['context'])
        # The leading stance entry is skipped (presumably a placeholder for
        # the root of the argument chain -- confirm against the data).
        stance_tail = ast.literal_eval(record['stance_label'])[1:]

        texts.append(" ".join(sentences))
        stances.append(" ".join(stance_tail))
        impacts.append(impact_encoder[record['impact_label']])

    return tuple(np.array(column) for column in (ids, texts, stances, impacts))

In [6]:
# Turn each frame into parallel (ids, texts, stance strings, impact codes)
# arrays via seperate_data.
(train_ids, train_texts,
 train_stance, train_labels) = seperate_data(train_df)
(valid_ids, valid_texts,
 valid_stance, valid_labels) = seperate_data(valid_df)
(trainval_ids, trainval_texts,
 trainval_stance, trainval_labels) = seperate_data(trainval_df)
(test_ids, test_texts,
 test_stance, test_labels) = seperate_data(test_df)

In [7]:
# Distinct impact label strings found in the training frame.
# NOTE(review): Series.unique() keeps order of first appearance, so this list
# is not guaranteed to line up with the integer codes in impact_encoder.
class_names = train_df['impact_label'].unique().tolist()

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [9]:
# text_count_vect = CountVectorizer(lowercase=True, stop_words="english", ngram_range=(1, 4), min_df=0.005)
# stance_count_vect = CountVectorizer(lowercase=True, ngram_range=(1, 5))

In [10]:
# X_train = text_count_vect.fit_transform(train_texts)
# X_valid = text_count_vect.transform(valid_texts)
# X_trainval = text_count_vect.transform(trainval_texts)
# X_test = text_count_vect.transform(test_texts)

In [11]:
# train_context_count_vect = context_count_vect.fit_transform(df_train['context'].to_numpy())
# valid_context_count_vect = context_count_vect.transform(df_valid['context'].to_numpy())
# test_context_count_vect = context_count_vect.transform(df_test['context'].to_numpy())

In [12]:
# train_stance_vec = stance_count_vect.fit_transform(train_stance)
# valid_stance_vec = stance_count_vect.transform(valid_stance)
# trainval_stance_vec = stance_count_vect.transform(trainval_stance)
# test_stance_vec = stance_count_vect.transform(test_stance)

In [13]:
# tfidf_text = TfidfVectorizer(max_features = 10000, ngram_range=(1,4))
# tfidf_context = TfidfVectorizer(max_features = 10000, ngram_range=(1,4))

In [14]:
# train_text_tfidf_vect = tfidf_text.fit_transform(train_texts)
# valid_text_tfidf_vect = tfidf_text.transform(valid_texts)
# trainval_text_tfidf_vect = tfidf_text.fit_transform(trainval_texts)
# test_text_tfidf_vect = tfidf_text.transform(test_texts)

In [15]:
# train_context_tfidf_vect = tfidf_context.fit_transform(df_train['context'].to_numpy())
# valid_context_tfidf_vect = tfidf_context.transform(df_valid['context'].to_numpy())

In [16]:
# X_train = hstack((X_train, train_context_count_vect))
# X_valid = hstack((X_valid, valid_context_count_vect))
# # X_test = hstack((X_test, test_context_count_vect))

# X_train = hstack((X_train, train_stance_vec))
# X_valid = hstack((X_valid, valid_stance_vec))
# X_trainval = hstack((X_trainval, trainval_stance_vec))
# X_test = hstack((X_test, test_stance_vec))

# X_train = hstack((X_train, train_text_tfidf_vect))
# X_valid = hstack((X_valid, valid_text_tfidf_vect))

# X_train = hstack((X_train, train_context_tfidf_vect))
# X_valid = hstack((X_valid, valid_context_tfidf_vect))

In [17]:
# 5-fold stratified cross-validation over the combined train+valid pool.
# Each fold fits its own vectorizers and random forest on the fold's training
# part, reports macro-F1 on the held-out part, and stores that model's
# predictions for the test set (combined across folds in a later cell).
fkold_test_pred = []
skf = StratifiedKFold(n_splits=5)
for train_index, valid_index in skf.split(trainval_texts, trainval_labels):
    # Fold-local names: the module-level train_*/valid_* arrays built from the
    # original CSV splits must not be overwritten with one fold's slice,
    # otherwise any cell (re-)run afterwards silently sees the last fold.
    fold_train_texts = trainval_texts[train_index]
    fold_train_stance = trainval_stance[train_index]
    fold_train_labels = trainval_labels[train_index]

    fold_valid_texts = trainval_texts[valid_index]
    fold_valid_stance = trainval_stance[valid_index]
    fold_valid_labels = trainval_labels[valid_index]

    # Vectorizers are re-created per fold so the vocabulary is learned from
    # the fold's training part only (no leakage from held-out or test rows).
    text_count_vect = CountVectorizer(lowercase=True, stop_words="english", ngram_range=(1, 4), min_df=0.005)
    stance_count_vect = CountVectorizer(lowercase=True, ngram_range=(1, 5))

    train_stance_vec = stance_count_vect.fit_transform(fold_train_stance)
    valid_stance_vec = stance_count_vect.transform(fold_valid_stance)
    test_stance_vec = stance_count_vect.transform(test_stance)

    X_train = text_count_vect.fit_transform(fold_train_texts)
    X_valid = text_count_vect.transform(fold_valid_texts)
    X_test = text_count_vect.transform(test_texts)

    # Final feature matrix: text n-gram counts followed by stance n-gram counts.
    X_train = hstack((X_train, train_stance_vec))
    X_valid = hstack((X_valid, valid_stance_vec))
    X_test = hstack((X_test, test_stance_vec))

    rfc = RandomForestClassifier(random_state=20210402)
    rfc.fit(X_train, fold_train_labels)
    valid_pred = rfc.predict(X_valid)
    print(f"RF F-1 score: {metrics.f1_score(fold_valid_labels, valid_pred, average='macro')}")

    test_pred = rfc.predict(X_test)
    fkold_test_pred.append(test_pred)
    

RF F-1 score: 0.5834223519572839
RF F-1 score: 0.598300844845681
RF F-1 score: 0.5713338170367855
RF F-1 score: 0.594878686255529
RF F-1 score: 0.5821078061799164


In [18]:
from scipy import stats

# Majority vote: for every test sample take the most frequent label among the
# per-fold predictions (axis 0 indexes the folds).
_fold_mode = stats.mode(np.asarray(fkold_test_pred), axis=0)
# Older SciPy keeps the reduced axis (arrays of shape (1, n_test)); SciPy >= 1.11
# drops it (shape (n_test,)), which would turn `test_pred_mode[0][0]` into a
# single scalar and make Counter(...) raise. Normalise both fields to 2-D so
# `test_pred_mode[0][0]` is the full per-sample vote vector on every version.
test_pred_mode = type(_fold_mode)(np.atleast_2d(_fold_mode[0]), np.atleast_2d(_fold_mode[1]))
print(Counter(test_pred_mode[0][0]))

Counter({2: 764, 1: 185, 0: 159})


In [19]:
# Store the per-sample modal prediction across folds in the 'pred' column of
# the sample submission frame and write it to disk without the index column.
majority_pred = test_pred_mode[0][0]
sample_df['pred'] = majority_pred
sample_df.to_csv('./rfc.csv', index=False)

In [20]:
# text_clf.fit(df_train.text.to_numpy(), df_train.impact_label.to_numpy())

In [None]:
# validation_pred = text_clf.predict(df_valid.text.to_numpy())

In [None]:
# print(f"F-1 score: {metrics.f1_score(df_valid.impact_label.to_numpy(), validation_pred, average='macro')}")

In [None]:
# metrics.classification_report(df_valid.impact_label.to_numpy(), validation_pred, target_names=class_names)

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Hyper-parameter grid for the random forest: 3 x 3 = 9 candidates, each
# evaluated with 5-fold cross-validation on all available cores.
parameters = {
    'n_estimators': [1000, 1200, 1500],
    'max_depth': [16, 20, 24],
}
text_clf = RandomForestClassifier(random_state=20210402)
# Rank candidates by macro-averaged F1, the same metric this notebook reports
# for the cross-validated forest. Without `scoring`, GridSearchCV falls back to
# the estimator's default score (accuracy), so selection and evaluation would
# use different metrics.
gs_clf = GridSearchCV(text_clf, parameters, scoring='f1_macro', cv=5, n_jobs=-1, verbose=1)

In [30]:
#gs_clf.fit(train_df.text.to_numpy(), train_df.impact_label.to_numpy())

In [31]:
#gs_clf.cv_results_

In [32]:
#gs_clf.fit(df_train.text.to_numpy(), df_train.impact_label.to_numpy())

In [33]:
#gs_clf.cv_results_

In [34]:
#predictions = sgd.predict(X_test)

In [35]:
#submission = df_test[['id']]

In [36]:
#submission['pred'] = predictions

In [37]:
#submission.to_csv('./sgd.csv', index=False)