In [None]:
!pip install tldextract
!pip install sweetviz

In [None]:
# Importing Packages
import pandas as pd
import numpy as np
import spacy
import sys
sys.path = [
    '../input/readability-package/',
] + sys.path
import readability
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, pos_tag_sents
from urllib.parse import urlparse
import re
from tldextract import extract

from sklearn import metrics, preprocessing, model_selection
import lightgbm as lgb
import copy
import sweetviz as sv
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import random

In [None]:
# Reading Data
# Raise the row display limit, then load the competition train/test tables.
pd.set_option('display.max_rows', 4000)
train = pd.read_csv(
    '../input/commonlitreadabilityprize/train.csv',
    low_memory=False,
)
test = pd.read_csv(
    '../input/commonlitreadabilityprize/test.csv',
    low_memory=False,
)

In [None]:
# Taken from https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline
def readability_measurements(passage: str):
    """
    Build readability features for a single passage.

    Calls ``readability.getmeasures(passage, lang='en')`` and flattens a fixed
    selection of its nested results into one list of 24 values, in this order:
    3 values from 'sentence info', 9 from 'readability grades', 6 from
    'word usage' and 6 from 'sentence beginnings'.

    Raises KeyError if an expected section or key is missing from the results.
    """
    measures = readability.getmeasures(passage, lang='en')

    # (section, keys) pairs; the returned list follows this order exactly.
    selection = (
        ('sentence info',
         ('characters_per_word', 'syll_per_word', 'words_per_sentence')),
        ('readability grades',
         ('Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase', 'GunningFogIndex',
          'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex')),
        ('word usage',
         ('tobeverb', 'auxverb', 'conjunction', 'pronoun', 'preposition',
          'nominalization')),
        ('sentence beginnings',
         ('pronoun', 'interrogative', 'article', 'subordination', 'conjunction',
          'preposition')),
    )

    features = []
    for section, keys in selection:
        section_values = measures[section]
        for key in keys:
            features.append(section_values[key])
    return features

# Taken from https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline
def spacy_features(df: pd.DataFrame):
    """
    Embed every excerpt with the spaCy ``en_core_web_lg`` model.

    The model is loaded on each call, every value of ``df.excerpt`` is run
    through it, and the resulting ``.vector`` of each document is collected.
    Returns a numpy array with one row per excerpt.

    Background reading:
    https://www.kaggle.com/konradb/linear-baseline-with-cv
    https://www.kaggle.com/anaverageengineer/comlrp-baseline-for-complete-beginners
    """
    nlp = spacy.load('en_core_web_lg')

    doc_vectors = []
    # NOTE(review): disable_pipes() is given no component names here - confirm
    # whether this actually switches any pipeline component off.
    with nlp.disable_pipes():
        for text in df.excerpt:
            doc_vectors.append(nlp(text).vector)

    return np.array(doc_vectors)

def get_spacy_col_names(n_dims=300):
    """
    Return the column names used for the spaCy vector features.

    Parameters
    ----------
    n_dims : int, default 300
        Number of names to generate. The default keeps the previous fixed
        behaviour; pass another value when the embedding width differs.

    Returns
    -------
    list of str
        ``['spacy_0', 'spacy_1', ..., f'spacy_{n_dims - 1}']``.
    """
    return [f"spacy_{i}" for i in range(n_dims)]

# Taken from https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline
def pos_tag_features(passage: str):
    """
    Count how often each tracked part-of-speech tag occurs in an excerpt.

    The passage is tokenised with ``word_tokenize`` and tagged with ``pos_tag``.
    Returns a list of 31 integers, one per entry of ``pos_tags`` below and in
    the same order; tags that are not in that list are ignored.
    """
    pos_tags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"]

    # Tally every tracked tag in one pass over the tagged tokens.
    tally = dict.fromkeys(pos_tags, 0)
    for tagged_token in pos_tag(word_tokenize(passage)):
        tag = tagged_token[1]
        if tag in tally:
            tally[tag] += 1

    return [tally[tag] for tag in pos_tags]

# Taken from https://www.kaggle.com/ravishah1/readability-feature-engineering-non-nn-baseline
def generate_other_features(passage: str):
    """
    Experimental surface statistics for one passage.

    Returns a list of 11 values: the counts of '.', ',', ';', '!' and '?',
    the character count, the number of space-separated tokens, the number of
    distinct tokens, the distinct/total token ratio, the longest token length
    and the mean token length.
    """
    # Tokens are split on a single space, so consecutive spaces yield empty tokens.
    tokens = passage.split(" ")
    token_lengths = [len(token) for token in tokens]

    punctuation_counts = [passage.count(mark) for mark in (".", ",", ";", "!", "?")]

    total_tokens = len(tokens)
    distinct_tokens = len(set(tokens))

    size_stats = [len(passage), total_tokens, distinct_tokens, distinct_tokens / total_tokens]
    length_stats = [np.max(token_lengths), np.mean(token_lengths)]

    return punctuation_counts + size_stats + length_stats

def extract_features(df):
    """
    Append all text-derived feature columns to ``df``.

    For every value of ``df["excerpt"]`` this adds, in order: the readability
    measurements, the spaCy vector columns, the part-of-speech tag counts and
    the miscellaneous surface statistics. The input frame is not modified; a
    new frame with the extra columns is returned.

    Each feature block is built on ``df``'s own index before being merged, so
    rows stay aligned even when the index is not the default 0..n-1 range
    (for example after filtering or sampling). The merge is an index join, so
    the index is assumed to hold unique labels.
    """
    readability_cols = ["chars_per_word", "syll_per_word", "words_per_sent",
                        "kincaid", "ari", "coleman_liau", "flesch", "gunning_fog", "lix", "smog", "rix", "dale_chall",
                        "tobeverb", "auxverb", "conjunction", "pronoun", "preposition", "nominalization",
                        "pronoun_b", "interrogative", "article", "subordination", "conjunction_b", "preposition_b"]
    pos_cols = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
                "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "RB", "RBR", "RBS", "RP", "TO", "UH",
                "VB", "VBD", "VBG", "VBZ", "WDT", "WP", "WRB"]
    other_cols = ["periods", "commas", "semis", "exclaims", "questions",
                  "num_char", "num_words", "unique_words", "word_diversity",
                  "longest_word", "avg_len_word"]

    def _attach(frame, rows, columns):
        # Reuse the frame's index: a fresh RangeIndex would make the index
        # merge drop or mismatch rows whenever the input index is not 0..n-1.
        block = pd.DataFrame(rows, columns=columns, index=frame.index)
        return pd.merge(frame, block, left_index=True, right_index=True)

    df = _attach(df, df["excerpt"].apply(readability_measurements).tolist(), readability_cols)
    df = _attach(df, spacy_features(df), get_spacy_col_names())
    df = _attach(df, df["excerpt"].apply(pos_tag_features).tolist(), pos_cols)
    df = _attach(df, df["excerpt"].apply(generate_other_features).tolist(), other_cols)

    return df

def extract_url_license_feat(df):
    """
    Derive categorical features from the ``url_legal`` and ``license`` columns.

    Added columns:
      article_year    first "2" + three digits found in the URL path, '' if none
      subdomain, domain, suffix
                      the parts reported by tldextract for the URL
      is_pdf          'Y' if '.pdf' occurs in the URL, else 'N'
      is_cc, is_by, is_sa, is_nc, is_nd, is_gnu
                      'Y'/'N' flags for the matching token in the license text
      license_version first "d.d" number in the license text as float, 0.0 if none

    Rows whose source value is missing keep that missing value in every
    derived column. Returns a new frame: ``df`` with the columns appended.
    """
    def _keep_missing(series, func):
        # pd.isna is used instead of `x is np.nan`: the identity test misses
        # NaNs that are not the np.nan singleton (e.g. values boxed from a
        # float column), which then crashed inside urlparse.
        return series.apply(lambda x: x if pd.isna(x) else func(x))

    def _first_match(pattern, text):
        # Every pattern ends in "|$", so search always matches; no hit gives ''.
        return re.search(pattern, text).group()

    def _flag(token):
        return lambda x: 'Y' if token in str(x) else 'N'

    def _license_version(x):
        found = _first_match(r'([0-9][.][0-9])|$', urlparse(x).path)
        return float(found) if found else float(0)

    url = df['url_legal']
    lic = df['license']
    temp = pd.DataFrame(index=df.index)

    temp['article_year'] = _keep_missing(
        url, lambda x: _first_match(r'(2\d{3})|$', urlparse(x).path))

    # Run tldextract once per URL and read the parts by attribute name rather
    # than by tuple position.
    url_parts = [None if pd.isna(x) else extract(x) for x in url]
    for part_name in ('subdomain', 'domain', 'suffix'):
        temp[part_name] = [x if parts is None else getattr(parts, part_name)
                           for x, parts in zip(url, url_parts)]

    temp['is_pdf'] = _keep_missing(url, _flag('.pdf'))

    for column, token in (('is_cc', 'CC'), ('is_by', 'BY'), ('is_sa', 'SA'),
                          ('is_nc', 'NC'), ('is_nd', 'ND'), ('is_gnu', 'GNU')):
        temp[column] = _keep_missing(lic, _flag(token))
    temp['license_version'] = _keep_missing(lic, _license_version)

    return pd.concat([df, temp], axis=1)

def handle_cate_NA(df, columns_to_ignore=None):
    """
    Fill missing values in object-dtype columns and flag where they were.

    For every object column that contains at least one missing value (and is
    not listed in ``columns_to_ignore``), missing entries are replaced by the
    string 'ABS' and a companion column ``'NA_POS_' + col`` is added holding
    'Y' where the value was missing and 'N' elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns_to_ignore : iterable of str, optional
        Column names to leave untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the fills applied. Indicator columns are appended
        in the order of the source columns, so the layout is reproducible.
    """
    ignored = set(columns_to_ignore or ())
    temp = df.copy()
    # The column list is taken before the loop, so the indicator columns
    # added below are never revisited.
    for col in temp.select_dtypes('object').columns:
        if col in ignored:
            continue
        missing = temp[col].isna()
        if not missing.any():
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # temp[col]: the chained in-place form may act on a temporary copy.
        temp[col] = temp[col].fillna('ABS')
        temp['NA_POS_' + col] = ['Y' if is_missing else 'N' for is_missing in missing]
    return temp

def handle_cont_NA(df, method='mean'):
    """
    Impute missing values in numeric columns and flag where they were.

    For every numeric column that contains at least one missing value, the
    missing entries are replaced by the column mean or median (both computed
    over the non-missing values) and a companion column ``'NA_POS_' + col`` is
    added holding 'Y' where the value was missing and 'N' elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    method : str, default 'mean'
        Case and whitespace are ignored. 'mean' selects the mean; any other
        value selects the median.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the fills and indicator columns applied.
    """
    action = ''.join(c.lower() for c in method if not c.isspace())
    temp = df.copy()
    for col in temp.select_dtypes(include='number').columns:
        missing = temp[col].isna()
        if not missing.any():
            continue
        # Series.median skips NaN; np.median on the same column returns NaN as
        # soon as one value is missing, which left the gaps unfilled.
        fill_value = temp[col].mean() if action == 'mean' else temp[col].median()
        # Assign back rather than fillna(inplace=True) on temp[col], which may
        # act on a temporary copy.
        temp[col] = temp[col].fillna(fill_value)
        temp['NA_POS_' + col] = ['Y' if is_missing else 'N' for is_missing in missing]
    return temp

def train_pca(df, list_of_columns, column_prefix, n_components=2):
    """
    Fit a scaler + PCA on a group of columns and append the components.

    The selected columns are standardised with ``StandardScaler``, a ``PCA``
    is fitted on the result, and the projected components are appended to a
    copy of ``df`` as ``<column_prefix>_1``, ``<column_prefix>_2``, ...
    The source columns are kept. The explained-variance ratios are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    list_of_columns : list of str
        Columns fed to the scaler and PCA.
    column_prefix : str
        Prefix of the generated component columns.
    n_components : int, default 2
        Number of principal components to keep.

    Returns
    -------
    (dict, pandas.DataFrame)
        The dict holds the fitted objects and metadata under the keys 'pca',
        'ss', 'list_of_columns' and 'column_prefix' (the input expected by
        ``apply_pca``); the frame is the copy of ``df`` with the new columns.
    """
    temp = df.copy()
    x = temp.loc[:, list_of_columns].values
    ss = StandardScaler().fit(x)
    x = ss.transform(x)
    pca = PCA(n_components=n_components)
    pca.fit(x)
    principalComponents = pca.transform(x)
    print(column_prefix, pca.explained_variance_ratio_)
    component_names = [f"{column_prefix}_{i}" for i in range(1, principalComponents.shape[1] + 1)]
    # Build the component frame on temp's index; with a fresh RangeIndex the
    # concat below would misalign rows for any non-default index.
    principalDf = pd.DataFrame(data=principalComponents, columns=component_names, index=temp.index)
    temp = pd.concat([temp, principalDf], axis=1)
    result_dict = {'pca': pca, 'ss': ss, 'list_of_columns': list_of_columns, 'column_prefix': column_prefix}
    return result_dict, temp

def apply_pca(trained_pca, df):
    """
    Project ``df`` with a scaler + PCA fitted earlier by ``train_pca``.

    Parameters
    ----------
    trained_pca : dict
        Must provide 'ss' (fitted scaler), 'pca' (fitted PCA),
        'list_of_columns' (columns to transform) and 'column_prefix'.
    df : pandas.DataFrame
        Frame containing ``list_of_columns``; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one ``<column_prefix>_<k>`` column appended per
        component returned by the PCA (k starts at 1). Source columns are kept.
    """
    prefix = trained_pca.get('column_prefix')
    temp = df.copy()
    x = temp.loc[:, trained_pca.get('list_of_columns')].values
    x = trained_pca.get('ss').transform(x)
    principalComponents = trained_pca.get('pca').transform(x)
    # Name the columns from the actual number of components instead of
    # assuming exactly two.
    component_names = [f"{prefix}_{i}" for i in range(1, principalComponents.shape[1] + 1)]
    # Reuse temp's index so the concat stays row-aligned for any index.
    principalDf = pd.DataFrame(data=principalComponents, columns=component_names, index=temp.index)
    temp = pd.concat([temp, principalDf], axis=1)
    return temp

In [None]:
# Feature pipeline for the training set, kept for reference; it is not
# executed in this cell.
# NOTE(review): presumably train_feat.csv below was produced by these steps - confirm.
# train_feat = extract_features(train)
# train_feat = extract_url_license_feat(train_feat)
# train_feat = handle_cate_NA(train_feat)
# train_feat = handle_cont_NA(train_feat)
# train_feat.head()

# Read the training feature table from the commonlitfe input instead.
train_feat = pd.read_csv('../input/commonlitfe/train_feat.csv', low_memory=False)

In [None]:
# Feature pipeline for the test set, kept for reference; it is not executed
# in this cell.
# NOTE(review): presumably test_feat.csv below was produced by these steps - confirm.
# test_feat = extract_features(test)
# test_feat = extract_url_license_feat(test_feat)
# test_feat = handle_cate_NA(test_feat)
# test_feat = handle_cont_NA(test_feat)
# test_feat.head()

# Read the test feature table from the commonlitfe input instead.
test_feat = pd.read_csv('../input/commonlitfe/test_feat.csv', low_memory=False)

In [None]:
pca_groups = [['smog', 'syll_per_word', 'spacy_29'], ['coleman_liau', 'nominalization', 'IN'], ['spacy_68', 'spacy_86', 'spacy_208', 'spacy_262', 'spacy_147', 'spacy_261'], ['spacy_110', 'spacy_114', 'spacy_298', 'spacy_269', 'spacy_151'], ['spacy_76', 'spacy_122', 'periods', 'spacy_72', 'spacy_196'], ['spacy_4', 'spacy_214', 'spacy_101', 'flesch', 'periods'], ['pronoun', 'spacy_269', 'spacy_294', 'spacy_151', 'spacy_147', 'spacy_110', 'spacy_196'], ['spacy_264', 'spacy_134', 'spacy_122', 'spacy_86', 'spacy_254', 'spacy_72'], ['spacy_76', 'spacy_114', 'spacy_298', 'spacy_69'], ['spacy_28', 'spacy_269', 'spacy_151', 'spacy_122', 'spacy_72', 'spacy_69', 'spacy_134', 'spacy_9', 'spacy_254'], ['spacy_101', 'spacy_214', 'spacy_262', 'spacy_89', 'spacy_110', 'spacy_208'], ['spacy_86', 'spacy_105', 'spacy_249', 'spacy_294', 'VBD', 'spacy_147', 'flesch', 'periods'], ['spacy_28', 'pronoun', 'spacy_122', 'spacy_101', 'spacy_110', 'periods', 'spacy_9'], ['spacy_249', 'PRP', 'spacy_68', 'spacy_294', 'VBD', 'spacy_261', 'spacy_4', 'spacy_298'], ['spacy_76', 'spacy_72', 'spacy_208', 'spacy_89', 'flesch', 'spacy_196', 'spacy_69'], ['spacy_249', 'spacy_68', 'spacy_76', 'spacy_122', 'spacy_208', 'spacy_214', 'spacy_101', 'spacy_254'], ['PRP', 'spacy_114', 'spacy_28', 'spacy_151', 'spacy_4', 'spacy_110', 'spacy_279', 'spacy_232'], ['VBD', 'spacy_264', 'spacy_134', 'spacy_269', 'spacy_261'], ['spacy_249', 'spacy_110', 'spacy_68', 'spacy_9', 'spacy_122', 'spacy_28', 'spacy_147'], ['spacy_269', 'PRP', 'spacy_151', 'spacy_298', 'spacy_101', 'spacy_198', 'spacy_72'], ['spacy_198', 'spacy_28', 'spacy_151', 'spacy_269', 'spacy_261'], ['spacy_249', 'PRP', 'spacy_114', 'spacy_122', 'spacy_110', 'spacy_264', 'spacy_208', 'spacy_133'], ['spacy_214', 'PRP', 'spacy_114', 'spacy_86', 'periods', 'spacy_28'], ['pronoun', 'spacy_76', 'spacy_68', 'spacy_262', 'VBD', 'spacy_122', 'spacy_105', 'spacy_298'], ['spacy_151', 'spacy_134', 'spacy_269', 'spacy_279', 'periods', 'spacy_89', 'spacy_133', 
'spacy_147', 'spacy_232'], ['spacy_249', 'spacy_86', 'PRP', 'spacy_114', 'spacy_122', 'spacy_69', 'spacy_294', 'spacy_68', 'spacy_254', 'spacy_110'], ['PRP', 'spacy_214', 'pronoun', 'VBD', 'spacy_114', 'spacy_254', 'spacy_294', 'spacy_261', 'spacy_208', 'spacy_134', 'spacy_4', 'spacy_89', 'spacy_298'], ['spacy_269', 'spacy_249', 'spacy_151', 'spacy_76', 'spacy_122', 'spacy_101', 'periods'], ['spacy_294', 'spacy_214', 'spacy_76', 'spacy_28', 'spacy_86', 'spacy_264', 'spacy_232', 'spacy_122', 'spacy_9'], ['spacy_294', 'spacy_68', 'spacy_122', 'spacy_4', 'spacy_264', 'spacy_261', 'spacy_196', 'spacy_9'], ['spacy_114', 'spacy_249', 'spacy_86', 'spacy_151', 'spacy_134', 'spacy_101', 'spacy_76', 'spacy_254', 'spacy_262'], ['spacy_249', 'PRP', 'pronoun', 'spacy_28', 'spacy_269', 'spacy_114', 'spacy_68', 'spacy_294', 'spacy_122', 'spacy_261', 'spacy_196', 'spacy_72', 'spacy_133'], ['spacy_86', 'spacy_76', 'flesch', 'spacy_4', 'spacy_89', 'spacy_110', 'spacy_9', 'spacy_151'], ['spacy_110', 'spacy_86', 'spacy_208', 'spacy_214', 'spacy_134'], ['spacy_114', 'spacy_269', 'spacy_249', 'spacy_101', 'spacy_76', 'spacy_198', 'spacy_9', 'flesch'], ['spacy_122', 'spacy_294', 'spacy_72'], ['spacy_298', 'spacy_214', 'VBD', 'spacy_114', 'spacy_264', 'spacy_68', 'spacy_9', 'spacy_134', 'spacy_262', 'spacy_4', 'spacy_147', 'flesch'], ['spacy_249', 'spacy_69', 'spacy_105', 'spacy_89', 'spacy_110'], ['preposition', 'smog'], ['spacy_14', 'chars_per_word', 'spacy_29'], ['IN', 'spacy_107'], ['spacy_160', 'nominalization', 'spacy_2'], ['syll_per_word', 'NN', 'spacy_200'], ['coleman_liau', 'NN', 'spacy_60', 'spacy_263'], ['spacy_14', 'dale_chall', 'JJ', 'spacy_182'], ['nominalization', 'spacy_46', 'spacy_155', 'spacy_107'], ['spacy_203', 'smog', 'spacy_149', 'spacy_24'], ['spacy_103', 'avg_len_word', 'num_char', 'spacy_200'], ['spacy_146', 'rix'], ['syll_per_word', 'IN', 'spacy_159', 'spacy_10'], ['preposition', 'spacy_29', 'syll_per_word', 'IN', 'spacy_10'], ['IN', 'spacy_29', 'num_char', 
'spacy_182', 'spacy_240', 'spacy_10'], ['chars_per_word', 'avg_len_word', 'nominalization', 'rix', 'spacy_149', 'JJ', 'spacy_2'], ['smog', 'spacy_60', 'spacy_146', 'spacy_97', 'spacy_162'], ['spacy_50', 'spacy_14', 'spacy_38', 'spacy_192', 'dale_chall', 'spacy_24'], ['spacy_203', 'coleman_liau', 'spacy_107', 'preposition'], ['spacy_14', 'coleman_liau', 'spacy_43', 'spacy_182', 'spacy_258'], ['smog', 'rix', 'dale_chall', 'spacy_197', 'spacy_251', 'spacy_107', 'spacy_240'], ['spacy_160', 'IN', 'spacy_27', 'spacy_192', 'JJ', 'nominalization'], ['chars_per_word', 'spacy_7', 'spacy_148', 'spacy_97', 'spacy_159', 'spacy_217', 'lix', 'spacy_200'], ['spacy_155', 'spacy_203', 'avg_len_word', 'spacy_60', 'spacy_252', 'NN', 'spacy_146', 'gunning_fog'], ['syll_per_word', 'num_char', 'spacy_38', 'spacy_24', 'preposition'], ['coleman_liau', 'spacy_2', 'spacy_160', 'spacy_107', 'spacy_162'], ['smog', 'spacy_43', 'spacy_197', 'dale_chall', 'spacy_155'], ['spacy_29', 'IN', 'chars_per_word', 'spacy_252', 'spacy_24', 'preposition', 'lix', 'spacy_200', 'spacy_50'], ['spacy_14', 'syll_per_word', 'spacy_7', 'spacy_27', 'nominalization', 'spacy_211', 'gunning_fog', 'spacy_10'], ['avg_len_word', 'spacy_251', 'rix', 'spacy_30', 'spacy_217', 'spacy_149'], ['spacy_97', 'spacy_149', 'spacy_200', 'spacy_263', 'spacy_162', 'spacy_182', 'gunning_fog'], ['spacy_14', 'spacy_203', 'spacy_43', 'JJ', 'spacy_30', 'dale_chall', 'spacy_146', 'spacy_50', 'spacy_10'], ['coleman_liau', 'spacy_251', 'nominalization', 'spacy_7', 'spacy_46', 'spacy_155', 'preposition', 'spacy_240', 'spacy_266', 'lix'], ['chars_per_word', 'syll_per_word', 'smog', 'IN', 'spacy_38', 'spacy_107', 'spacy_2'], ['nominalization', 'spacy_251', 'IN', 'spacy_155', 'NN', 'spacy_24', 'gunning_fog'], ['smog', 'coleman_liau', 'spacy_14', 'spacy_146', 'spacy_266', 'spacy_240', 'spacy_182'], ['syll_per_word', 'spacy_43', 'spacy_197', 'spacy_46', 'spacy_217', 'spacy_38', 'dale_chall', 'spacy_192', 'rix', 'spacy_200', 'spacy_162'], 
['num_char', 'spacy_29', 'lix', 'spacy_10'], ['avg_len_word', 'spacy_10', 'preposition'], ['spacy_217', 'spacy_149', 'spacy_97', 'spacy_24', 'num_char', 'syll_per_word', 'spacy_266'], ['spacy_252', 'spacy_29', 'spacy_14', 'spacy_30', 'spacy_211', 'spacy_192', 'dale_chall', 'spacy_107', 'spacy_60', 'lix'], ['coleman_liau', 'smog', 'spacy_197', 'spacy_251', 'rix', 'spacy_27'], ['spacy_103', 'spacy_160', 'chars_per_word', 'spacy_7', 'spacy_146', 'spacy_240'], ['chars_per_word', 'spacy_29', 'spacy_30', 'spacy_217', 'spacy_197', 'spacy_155', 'spacy_24', 'spacy_97', 'spacy_162', 'spacy_182', 'spacy_10'], ['coleman_liau', 'avg_len_word', 'smog', 'spacy_258', 'spacy_7', 'preposition', 'JJ', 'spacy_240'], ['IN', 'num_char', 'spacy_38', 'lix'], ['spacy_29', 'spacy_2', 'dale_chall', 'spacy_203', 'spacy_263', 'spacy_107', 'spacy_160'], ['coleman_liau', 'syll_per_word', 'avg_len_word', 'nominalization', 'spacy_103', 'num_char', 'spacy_38', 'spacy_192', 'JJ'], ['chars_per_word', 'spacy_252', 'spacy_14', 'spacy_197', 'spacy_211', 'spacy_182', 'spacy_240', 'lix', 'gunning_fog', 'spacy_266'], ['chars_per_word', 'spacy_27', 'spacy_149', 'spacy_197', 'NN', 'preposition', 'spacy_97', 'gunning_fog'], ['nominalization', 'rix', 'spacy_14', 'num_char'], ['coleman_liau', 'syll_per_word', 'avg_len_word', 'smog', 'spacy_103', 'JJ', 'spacy_251', 'dale_chall'], ['spacy_43', 'spacy_2', 'spacy_103', 'spacy_192', 'spacy_240', 'spacy_200', 'spacy_146', 'spacy_149', 'spacy_24', 'spacy_107', 'spacy_263'], ['coleman_liau', 'avg_len_word', 'nominalization', 'smog', 'spacy_7', 'rix', 'spacy_46', 'spacy_29', 'spacy_30', 'spacy_162', 'lix', 'gunning_fog', 'spacy_10'], ['spacy_251', 'spacy_38', 'spacy_60', 'dale_chall', 'num_char', 'spacy_252', 'spacy_266', 'JJ', 'spacy_217', 'spacy_24', 'spacy_182'], ['nominalization', 'spacy_97', 'spacy_148', 'spacy_240'], ['spacy_7', 'spacy_29', 'spacy_14', 'spacy_43', 'spacy_2'], ['coleman_liau', 'chars_per_word', 'syll_per_word', 'rix', 'spacy_146', 'spacy_197', 
'spacy_192'], ['spacy_148', 'spacy_258', 'spacy_7', 'spacy_149', 'rix', 'spacy_107', 'dale_chall'], ['syll_per_word', 'avg_len_word', 'smog', 'nominalization', 'spacy_263', 'spacy_203', 'spacy_211', 'preposition', 'spacy_200', 'spacy_162'], ['spacy_251', 'spacy_46', 'spacy_43', 'IN', 'spacy_155', 'gunning_fog'], ['spacy_29', 'spacy_30', 'spacy_38', 'JJ', 'spacy_107', 'spacy_162', 'lix', 'gunning_fog'], ['coleman_liau', 'spacy_46', 'spacy_252', 'spacy_14', 'spacy_251', 'spacy_200', 'nominalization', 'rix', 'num_char', 'spacy_155'], ['syll_per_word', 'chars_per_word', 'spacy_146', 'spacy_217', 'spacy_159', 'spacy_266'], ['chars_per_word', 'syll_per_word', 'IN', 'spacy_146', 'smog'], ['rix', 'spacy_14', 'spacy_263'], ['coleman_liau', 'avg_len_word', 'spacy_43', 'nominalization', 'spacy_203', 'dale_chall'], ['spacy_203', 'spacy_2', 'dale_chall', 'spacy_217', 'spacy_266', 'preposition', 'spacy_159', 'spacy_162', 'lix'], ['num_char', 'spacy_149', 'JJ', 'spacy_46', 'gunning_fog'], ['syll_per_word', 'coleman_liau', 'chars_per_word', 'smog', 'spacy_43', 'nominalization', 'num_char', 'dale_chall', 'spacy_197', 'spacy_192', 'spacy_149', 'spacy_97', 'spacy_146', 'lix'], ['spacy_27', 'rix', 'spacy_103', 'IN', 'spacy_162', 'NN', 'gunning_fog', 'spacy_10'], ['spacy_107', 'preposition', 'spacy_266'], ['chars_per_word', 'avg_len_word', 'smog', 'spacy_103', 'nominalization', 'spacy_46', 'spacy_14'], ['spacy_155', 'spacy_97', 'spacy_27', 'spacy_107', 'spacy_182', 'lix', 'gunning_fog', 'spacy_266'], ['coleman_liau', 'chars_per_word', 'nominalization', 'rix', 'smog', 'spacy_263', 'spacy_2'], ['syll_per_word', 'avg_len_word', 'spacy_103', 'spacy_149'], ['spacy_27', 'dale_chall', 'spacy_211', 'NN', 'spacy_24'], ['spacy_155', 'spacy_27', 'dale_chall', 'spacy_211', 'NN', 'avg_len_word', 'spacy_162', 'spacy_24'], ['spacy_14', 'avg_len_word', 'spacy_160', 'spacy_29', 'spacy_197', 'spacy_30', 'spacy_155', 'spacy_27', 'dale_chall']]

drop_other_fe = ['periods', 'commas', 'semis', 'exclaims', 'questions', 'num_char', 'num_words', 'unique_words', 'word_diversity', 'longest_word', 'avg_len_word']

# For each column group: fit scaler + PCA on train, append the components as
# f<index>_1 / f<index>_2, then project test with the same fitted objects.
for index, group in enumerate(pca_groups):
    key = f"f{index}"
    pca_res, train_feat = train_pca(train_feat, group, key)
    test_feat = apply_pca(pca_res, test_feat)

# The surface-statistic columns are removed only after the PCA step, because
# some of the groups above use them as inputs.
train_feat = train_feat.drop(columns=drop_other_fe)
test_feat = test_feat.drop(columns=drop_other_fe)

In [None]:
# Keep an independent copy of standard_error, then remove it from the features.
std_error = train_feat['standard_error'].copy()
train_feat = train_feat.drop(columns=['standard_error'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are identifiers, raw text or the target; never encoded here and
# excluded from the model inputs further down.
ignore_cols = ['id','url_legal','license','excerpt', 'target']

# Integer-encode every remaining object column of train, and the same column
# of test with the same mapping.
for col in train_feat.select_dtypes('object').columns.tolist():
    if col in ignore_cols:
        continue
    lbl = LabelEncoder()
    # Fit on the train and test values together: an encoder fitted on train
    # alone raises on any category that only occurs in test. When test has no
    # such category the resulting codes are the same as before.
    lbl.fit(pd.concat([train_feat[col], test_feat[col]], ignore_index=True))
    train_feat[col] = lbl.transform(train_feat[col])
    test_feat[col] = lbl.transform(test_feat[col])

In [None]:
# Model inputs are all columns outside ignore_cols; the target is kept apart.
X_train = train_feat.loc[:, [name for name in train_feat.columns if name not in ignore_cols]]
y_train = train_feat['target']
test_X = test_feat.loc[:, [name for name in test_feat.columns if name not in ignore_cols]]
# Cell output: columns that exist in train but not in test.
[name for name in train_feat.columns if name not in test_feat.columns]

In [None]:
# Show (rows, columns) of the train and test model matrices.
for matrix in (X_train, test_X):
    print(matrix.shape)

In [None]:
from sklearn import metrics, preprocessing, model_selection
import lightgbm as lgb

def runLGB_reg(train_X, train_y, test_X, sample_weight, test_y=None, test_X2=None, dep=8, seed=0, data_leaf=50, rounds=20000):
    """
    Train one LightGBM regressor and predict on up to two feature sets.

    Parameters
    ----------
    train_X, train_y
        Training features and target.
    test_X
        Features to predict on; used as the validation set when ``test_y`` is given.
    sample_weight
        Accepted for call compatibility but currently NOT used in training.
        NOTE(review): callers pass the target's standard error here - decide
        whether it should be wired in as a weight (and in which form).
    test_y : optional
        Labels for ``test_X``. When given, training early-stops after 200
        rounds without improvement, logs every 500 rounds, and the RMSE on
        ``test_X`` is computed and printed.
    test_X2 : optional
        Second feature set to predict on (e.g. the hold-out test data).
    dep, seed, data_leaf
        ``max_depth``, the bagging/feature-fraction seed and
        ``min_data_in_leaf`` of the model.
    rounds : int
        Maximum number of boosting rounds.

    Returns
    -------
    (model, loss, pred_test_y, pred_test_y2)
        ``loss`` is the RMSE on ``test_X`` or 0 when ``test_y`` is None;
        ``pred_test_y2`` is None when ``test_X2`` is None.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "max_depth": dep,
        "num_leaves": 30,
        "min_data_in_leaf": data_leaf,
        "learning_rate": 0.01,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.2,
        "feature_fraction_seed": seed,
        "bagging_freq": 1,
        "bagging_seed": seed,
        "lambda_l2": 3,
        "lambda_l1": 3,
        "verbosity": -1,
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        # Early stopping and logging are passed as callbacks, which work on
        # releases that no longer accept the early_stopping_rounds /
        # verbose_eval keyword arguments; older releases name the logging
        # callback print_evaluation.
        log_every = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
        model = lgb.train(
            params, lgtrain, rounds,
            valid_sets=[lgtest],
            callbacks=[lgb.early_stopping(stopping_rounds=200), log_every(500)],
        )
    else:
        # No validation labels: train for the full number of rounds. (The
        # former lgb.DMatrix call here does not exist in LightGBM and raised.)
        model = lgb.train(params, lgtrain, rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # The second prediction is optional; predicting on None used to crash.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = np.sqrt(metrics.mean_squared_error(test_y, pred_test_y))
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [None]:
print("Building model..")
cv_scores = []
pred_test_full = 0
pred_train = np.zeros(X_train.shape[0])
n_splits = 5
kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=7988)
model_name = "lgb"

# Within every fold one model is trained per configuration and their
# predictions are averaged.
fold_model_configs = [
    {"dep": 6, "data_leaf": 200, "seed": 2019},
    {"dep": 7, "data_leaf": 180, "seed": 9873},
]

for dev_index, val_index in kf.split(X_train, y_train):
    dev_X, val_X = X_train.iloc[dev_index, :], X_train.iloc[val_index, :]
    # kf.split yields row positions, so select by position (.iloc); plain []
    # on a Series looks the numbers up as index labels and is only correct
    # while the index happens to be 0..n-1.
    dev_y, val_y = y_train.iloc[dev_index], y_train.iloc[val_index]
    std_error_x = std_error.iloc[dev_index]

    pred_val = 0
    pred_test = 0
    n_models = 0.

    for config in fold_model_configs:
        model, loss, pred_v, pred_t = runLGB_reg(dev_X, dev_y, val_X, std_error_x, val_y, test_X, **config)
        pred_val += pred_v
        pred_test += pred_t
        n_models += 1

    pred_val /= n_models
    pred_test /= n_models

    # Fold score: RMSE of the averaged validation predictions.
    loss = np.sqrt(metrics.mean_squared_error(val_y, pred_val))

    # Out-of-fold predictions for train; the test prediction is the mean over folds.
    pred_train[val_index] = pred_val
    pred_test_full += pred_test / n_splits
    cv_scores.append(loss)
    print(cv_scores)
print(np.mean(cv_scores))