# Load libraries

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

import tensorflow as tf
import tensorflow_addons as tfa

import transformers

from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

from datasets import Dataset

import os
import gc
import sys
from tqdm.notebook import tqdm

# Set Configs

In [2]:
CONFIG = {
        'folds': 5,
        'seed': 101,
        'robertabase': '../input/huggingface-roberta-variants/roberta-base/roberta-base',
        'robertalarge': '../input/huggingface-roberta-variants/roberta-large/roberta-large',
        #'debertav3base': '../input/debertav3base',
        #'debertav3large': '../input/deberta-v3-large/deberta-v3-large/',
        'xlmrobertabase': '../input/huggingface-roberta-variants/tf-xlm-roberta-base/tf-xlm-roberta-base',
        'distilrobertabase': '../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base',
        #'debertav3large_npy': '../input/fb3-save-pretrained-embeddings/debertav3large_FB3.npy',
        #'distilrobertabase_npy': '../input/fb3-save-pretrained-embeddings/distilrobertabase_FB3.npy',

        'batch_size': 4,
        'max_len': 512
        }

# Read in data

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
       os.path.join(dirname, filename)

In [4]:
#df = pd.read_csv("/kaggle/input/large580/train_20000.csv")
#msk = np.random.rand(len(df)) <= 0.9
#tgtCols = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
#train = df[msk].dropna()
#test = df[~msk].dropna()
#test = pd.read_csv("../input/feedback-prize-english-language-learning/test.csv")

In [5]:

train = pd.read_csv("../input/feedback-prize-english-language-learning/train.csv")
#test = pd.read_csv("/kaggle/input/580data/test.csv")
test = pd.read_csv("/kaggle/input/580data/test_balanced.csv")
tgtCols = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
#train = train[['text_id','full_text','cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']]
#test = test[['text_id','full_text','cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']]
print(train.shape)
print(test.shape)


(3911, 8)
(783, 11)


In [6]:
test

Unnamed: 0.1,Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,average,bin
0,272,13C400DD9794,The year book is for to not forget anything an...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0
1,3051,D9BC7F4F22F0,Well what i think about praising for a student...,2.5,2.5,2.5,3.0,2.5,2.0,2.5,0
2,800,3E170458E9A1,I\n\ndisagree that first impressions are almos...,2.0,2.0,2.0,2.0,2.0,2.5,2.1,0
3,3206,E0BFF1488787,I disagree with schools having a program with ...,2.5,2.0,2.5,2.0,2.0,2.5,2.3,0
4,2664,C50BE3C76571,I dont like becuase the student forget all inf...,3.0,2.5,2.0,2.0,2.0,2.5,2.4,0
...,...,...,...,...,...,...,...,...,...,...,...
778,2207,A4A90A401002,People who value self-reliance define it as th...,3.0,3.5,3.5,4.0,3.5,4.0,3.6,2
779,2747,CA11FD3CAC43,Many people have been told about the fact that...,4.5,4.0,3.5,4.0,3.5,3.5,3.9,2
780,155,0BB9FAE6E27B,Setting A Good Example\n\nHave you thought of ...,3.5,4.0,4.5,4.0,3.5,3.5,3.9,2
781,3464,ED0A8E614649,Techonology has becoming powerful that let stu...,4.5,3.5,4.0,4.0,4.0,3.5,3.9,2


In [7]:
my_list = train.columns.values.tolist()
my_list

['text_id',
 'full_text',
 'cohesion',
 'syntax',
 'vocabulary',
 'phraseology',
 'grammar',
 'conventions']

In [8]:
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5


# Create folds

In [9]:
train.loc[:, 'kfold'] = -1 # Create a new column `fold` containing `-1`s.
train = train.sample(frac=1).reset_index(drop=True) # Shuffle the rows.
data_labels = train[tgtCols].values

In [10]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [11]:
mskf = MultilabelStratifiedKFold(n_splits=5)
for f, (t, v) in enumerate(mskf.split(X=train, y=data_labels)):
    train.loc[v, 'kfold'] = f + 1

In [12]:
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,kfold
0,4E22E3143561,"Agree, living in a world where you sometimes h...",3.0,3.0,3.5,3.5,3.0,3.5,3
1,E21D14227555,A wise man once said ''the usage of technology...,3.5,3.5,3.5,3.5,3.5,3.0,5
2,BF267B42A2A4,The humanity now in days had been not so good ...,3.0,3.0,3.0,3.5,3.5,3.5,2
3,B5AE26EEE247,Do you think that having a positive attitude i...,4.0,3.5,3.5,3.5,3.5,3.5,1
4,6013DA298542,Schools have partnered with local companies to...,2.0,2.5,3.0,3.0,3.0,2.5,4
...,...,...,...,...,...,...,...,...,...
3906,00D281524375,Technology allows people to do many things suc...,3.5,2.5,3.5,3.0,3.0,3.0,1
3907,D242BBCBEAFB,Some student can take classes at home because ...,2.5,2.5,2.5,2.5,2.5,2.5,4
3908,EBE0730AFF0E,Adopting the failure is the way to become mast...,3.5,2.5,3.5,3.5,2.5,2.5,3
3909,D86C20117DC9,Students should work in groups or should works...,4.0,4.0,3.5,3.5,3.5,3.0,5


In [13]:
train['kfold'].value_counts().sort_index()

1    782
2    782
3    782
4    783
5    782
Name: kfold, dtype: int64

## Data process functions

In [14]:
def self_encode(texts, chkpt):
    
    tokenizer = transformers.AutoTokenizer.from_pretrained(CONFIG[chkpt])
    tokenizer.save_pretrained('./tokenizer/')

    input_ids = []
    attention_mask = []
    
    for text in texts.tolist():
        token = tokenizer(text, 
                          add_special_tokens=True, 
                          max_length=CONFIG['max_len'], 
                          return_attention_mask=True, 
                          return_tensors="np", 
                          truncation=True, 
                          padding='max_length')
        input_ids.append(token['input_ids'][0])
        attention_mask.append(token['attention_mask'][0])
    return np.array(input_ids, dtype="int32"), np.array(attention_mask, dtype="int32")

In [15]:
def pickle_dump(path, saveobj):
    import pickle
    handler = open(path,"wb")
    pickle.dump(saveobj,handler)
#     print("File pickled")
    handler.close()

In [16]:
def pickle_load(path):
    import pickle
    file = open(path,'rb')
    loader = pickle.load(file)
    file.close()
    return loader

## Transformer embeddings

In [17]:
def pretrain_embeddings(chkpt, df):
    cfg = transformers.AutoConfig.from_pretrained(CONFIG[chkpt], output_hidden_states=True)
    cfg.hidden_dropout_prob = 0
    cfg.attention_probs_dropout_prob = 0
    cfg.save_pretrained('./tokenizer/')
    
    input_ids = tf.keras.layers.Input(
        shape=(CONFIG['max_len'],), dtype=tf.int32, name="input_ids"
    )
    
    attention_masks = tf.keras.layers.Input(
        shape=(CONFIG['max_len'],), dtype=tf.int32, name="attention_masks"
    )
    
    try:
        model = transformers.TFAutoModel.from_pretrained(CONFIG[chkpt], config=cfg)
    except:
        model = transformers.TFAutoModel.from_pretrained(CONFIG[chkpt], config=cfg, from_pt=True)
        
    output = model(
        input_ids, attention_mask=attention_masks
    )
    hidden_states = output.hidden_states
    mean_pool = []
    for hidden_s in hidden_states[-1:]:
        #def call(self, inputs, mask=None):
        broadcast_mask = tf.expand_dims(tf.cast(attention_masks, "float32"), -1)
        embedding_sum = tf.reduce_sum(hidden_s * broadcast_mask, axis=1)
        mask_sum = tf.reduce_sum(broadcast_mask, axis=1)
        mask_sum = tf.math.maximum(mask_sum, tf.constant([1e-9]))
        tmp = embedding_sum / mask_sum
        mean_pool.append(tmp)
    output = tf.stack(mean_pool,axis=2)
   
    #output = tf.stack(
    #    [MeanPool()(hidden_s, mask=attention_masks) for hidden_s in hidd20000en_states[-1:]], 
    #    axis=2)
    
    output = tf.squeeze(output, axis=-1)
    
    model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)

    model.compile(optimizer="adam",
                 loss='huber_loss',
                 metrics=[tf.keras.metrics.RootMeanSquaredError()],
                 )
    print(model.summary())
    dataset = self_encode(df['full_text'], chkpt)
    preds = model.predict(dataset, batch_size=CONFIG['batch_size'])
    
    del model, dataset
    _ = gc.collect()
    
    return preds

# Model training

In [18]:

train_data = pretrain_embeddings('distilrobertabase', train)

#train_data = np.concatenate([train_data, pretrain_embeddings('bertbasecased', train)], axis=1)
train_data = np.concatenate([train_data, pretrain_embeddings('robertabase', train)], axis=1)
train_data = np.concatenate([train_data, pretrain_embeddings('robertalarge', train)], axis=1)

train_data.shape

2022-11-30 00:10:46.561274: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 00:10:46.562230: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 00:10:46.563258: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 00:10:46.564011: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 00:10:46.564744: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast (TFOpLambda)            (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_masks[0][0]        

2022-11-30 00:11:12.252824: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast_1 (TFOpLambda)          (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model_1 (TFRobertaMo TFBaseModelOutputWit 124645632   input_ids[0][0]                  
                                                                 attention_masks[0][0]      

(3911, 2560)

In [19]:
scores = []
rmse_scores = []

for fold in range(1,CONFIG['folds']):

    print('-'*35)
    print(f'## Fold {fold}')
    print('-'*35)

    trn_idx = train[train['kfold']==fold].index.values
    val_idx = train[train['kfold']!=fold].index.values
    print(f"trn_idx len is {len(trn_idx)}")

    X_train = train_data[trn_idx,:]
    X_valid = train_data[val_idx,:]

    y_train = train[train['kfold']==fold][tgtCols].copy()
    y_valid = train[train['kfold']!=fold][tgtCols].copy()

    val_preds = np.zeros((len(val_idx),6))

    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        clf = SVR(C=10)
        clf.fit(X_train, y_train[tgt].values)
        pickle_dump(f"./SVR_tgt{tgt}_fold{fold}.pkl", clf)
        val_preds[:,i] = clf.predict(X_valid)
   
    
    for i in range(len(tgtCols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_valid[tgtCols].values[:,i],val_preds[:,i])))
        score = np.mean(rmse_scores)
    #score = mcrmse(y_valid[tgtCols].values, val_preds)
        scores.append(score)
    print("Fold : {} RMSE score: {}".format(fold,score))

    print('-'*35)
    print('Overall CV RMSE =',np.mean(scores))


-----------------------------------
## Fold 1
-----------------------------------
trn_idx len is 782
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 1 RMSE score: 0.4667085991638542
-----------------------------------
Overall CV RMSE = 0.473426100886325
-----------------------------------
## Fold 2
-----------------------------------
trn_idx len is 782
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 2 RMSE score: 0.470851926925759
-----------------------------------
Overall CV RMSE = 0.4720277924698781
-----------------------------------
## Fold 3
-----------------------------------
trn_idx len is 782
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 3 RMSE score: 0.47085745521560124
-----------------------------------
Overall CV RMSE = 0.4718961607960865
-----------------------------------
## Fold 4
-----------------------------------
trn_idx len is 783
cohesion , syntax , vocabulary , phraseology 

In [20]:
del train_data
_ = gc.collect()

# Model inference on Balanced Test

In [21]:
test_data = pretrain_embeddings('distilrobertabase', test)

#test_data = np.concatenate([test_data, pretrain_embeddings('bertbasecased', test)], axis=1)
test_data = np.concatenate([test_data, pretrain_embeddings('robertabase', test)], axis=1)
test_data = np.concatenate([test_data, pretrain_embeddings('robertalarge', test)], axis=1)

test_data.shape

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast_3 (TFOpLambda)          (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model_3 (TFRobertaMo TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_masks[0][0]      

(783, 2560)

In [22]:
fold_preds = []

for fold in range(1,CONFIG['folds']):

    print('-'*35)
    print(f'## Fold {fold}')
    print('-'*35)
    
    test_preds = np.zeros((len(test_data),6))
    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        model = pickle_load(f"./SVR_tgt{tgt}_fold{fold}.pkl")
        test_preds[:,i] = model.predict(test_data)
    
    fold_preds.append(test_preds)
    
    for i in range(len(tgtCols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_valid[tgtCols].values[:,i],val_preds[:,i])))
        score = np.mean(rmse_scores)
    #score = mcrmse(y_valid[tgtCols].values, val_preds)
        scores.append(score)
    print("Fold : {} RMSE score: {}".format(fold,score))

    print('-'*35)
    print('Overall CV RMSE =',np.mean(scores))
    
    del model
    _ = gc.collect()

-----------------------------------
## Fold 1
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 1 RMSE score: 0.4706498256951178
-----------------------------------
Overall CV RMSE = 0.47154122214356003
-----------------------------------
## Fold 2
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 2 RMSE score: 0.47059791831499687
-----------------------------------
Overall CV RMSE = 0.4714197885054357
-----------------------------------
## Fold 3
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 3 RMSE score: 0.4705608416149106
-----------------------------------
Overall CV RMSE = 0.47132239286579874
-----------------------------------
## Fold 4
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 4 RMSE score: 0.47053303408984587
-----

In [23]:
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)

In [24]:
output_df = test[['text_id']].reset_index()
output_df

Unnamed: 0,index,text_id
0,0,13C400DD9794
1,1,D9BC7F4F22F0
2,2,3E170458E9A1
3,3,E0BFF1488787
4,4,C50BE3C76571
...,...,...
778,778,A4A90A401002
779,779,CA11FD3CAC43
780,780,0BB9FAE6E27B
781,781,ED0A8E614649


In [25]:
preds_df = pd.DataFrame(preds, columns = tgtCols)
preds_df

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,1.800074,1.804792,2.178533,1.920928,1.800869,1.883982
1,2.748422,2.644605,2.971835,2.792340,2.528817,2.422574
2,1.938913,1.796602,2.284502,1.923307,1.714191,2.035963
3,2.479802,2.568310,2.683838,2.774912,2.789020,2.794571
4,2.273604,2.116479,2.342235,2.043774,1.933587,2.087017
...,...,...,...,...,...,...
778,3.629048,3.534972,3.808915,3.730619,3.613037,3.660695
779,3.804959,3.595449,3.729805,3.638323,3.525637,3.746667
780,3.524233,3.557455,3.792438,3.612432,3.485740,3.649140
781,3.802464,3.574562,3.808553,3.664019,3.454725,3.546447


In [26]:
preds_df['text_id'] = output_df['text_id']
preds_df = preds_df.reindex(['text_id', *preds_df.columns], axis=1).iloc[: , :-1]
preds_df

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,13C400DD9794,1.800074,1.804792,2.178533,1.920928,1.800869,1.883982
1,D9BC7F4F22F0,2.748422,2.644605,2.971835,2.792340,2.528817,2.422574
2,3E170458E9A1,1.938913,1.796602,2.284502,1.923307,1.714191,2.035963
3,E0BFF1488787,2.479802,2.568310,2.683838,2.774912,2.789020,2.794571
4,C50BE3C76571,2.273604,2.116479,2.342235,2.043774,1.933587,2.087017
...,...,...,...,...,...,...,...
778,A4A90A401002,3.629048,3.534972,3.808915,3.730619,3.613037,3.660695
779,CA11FD3CAC43,3.804959,3.595449,3.729805,3.638323,3.525637,3.746667
780,0BB9FAE6E27B,3.524233,3.557455,3.792438,3.612432,3.485740,3.649140
781,ED0A8E614649,3.802464,3.574562,3.808553,3.664019,3.454725,3.546447


In [27]:
preds_df.to_csv('tesths.csv')

## Running the final text dataset

In [28]:
test = pd.read_csv("/kaggle/input/580data/test.csv")

In [29]:
test_data = pretrain_embeddings('distilrobertabase', test)

#test_data = np.concatenate([test_data, pretrain_embeddings('bertbasecased', test)], axis=1)
test_data = np.concatenate([test_data, pretrain_embeddings('robertabase', test)], axis=1)
test_data = np.concatenate([test_data, pretrain_embeddings('robertalarge', test)], axis=1)

test_data.shape

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast_6 (TFOpLambda)          (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model_6 (TFRobertaMo TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_masks[0][0]      

(8, 2560)

In [30]:
fold_preds = []

for fold in range(1,CONFIG['folds']):

    print('-'*35)
    print(f'## Fold {fold}')
    print('-'*35)
    
    test_preds = np.zeros((len(test_data),6))
    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        model = pickle_load(f"./SVR_tgt{tgt}_fold{fold}.pkl")
        test_preds[:,i] = model.predict(test_data)
    
    fold_preds.append(test_preds)
    
    for i in range(len(tgtCols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_valid[tgtCols].values[:,i],val_preds[:,i])))
        score = np.mean(rmse_scores)
    #score = mcrmse(y_valid[tgtCols].values, val_preds)
        scores.append(score)
    print("Fold : {} RMSE score: {}".format(fold,score))

    print('-'*35)
    print('Overall CV RMSE =',np.mean(scores))
    
    del model
    _ = gc.collect()

-----------------------------------
## Fold 1
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 1 RMSE score: 0.4705114060147955
-----------------------------------
Overall CV RMSE = 0.47117587938357997
-----------------------------------
## Fold 2
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 2 RMSE score: 0.47049410355475524
-----------------------------------
Overall CV RMSE = 0.4711192888782515
-----------------------------------
## Fold 3
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 3 RMSE score: 0.4704799469965404
-----------------------------------
Overall CV RMSE = 0.4710706062021693
-----------------------------------
## Fold 4
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 4 RMSE score: 0.4704681498646947
-------

In [31]:
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)

In [32]:
preds_df = pd.DataFrame(preds, columns = tgtCols)
preds_df

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,2.838727,2.748008,3.025828,2.893462,2.60169,2.597918
1,2.722521,2.461244,2.749818,2.476476,2.115154,2.62211
2,3.460912,3.354621,3.550451,3.457997,3.360515,3.341161
3,3.277663,3.249178,3.499111,3.319013,3.306077,3.027291
4,3.730765,3.690547,3.956144,3.722939,3.635976,3.484476
5,3.706104,3.621633,3.981173,3.628974,3.575151,3.778684
6,3.866864,3.803552,4.026429,3.899598,3.796488,3.807109
7,4.073992,3.844181,4.063811,3.883764,3.765704,3.903198


In [33]:
output_df = test[['text_id']].reset_index()
output_df

Unnamed: 0,index,text_id
0,0,0000C359D63E
1,1,000BAD50D026
2,2,00367BB2546B
3,3,hp
4,4,tkm
5,5,high school
6,6,college1
7,7,college2


In [34]:
preds_df['text_id'] = output_df['text_id']
preds_df = preds_df.reindex(['text_id', *preds_df.columns], axis=1).iloc[: , :-1]
preds_df

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.838727,2.748008,3.025828,2.893462,2.60169,2.597918
1,000BAD50D026,2.722521,2.461244,2.749818,2.476476,2.115154,2.62211
2,00367BB2546B,3.460912,3.354621,3.550451,3.457997,3.360515,3.341161
3,hp,3.277663,3.249178,3.499111,3.319013,3.306077,3.027291
4,tkm,3.730765,3.690547,3.956144,3.722939,3.635976,3.484476
5,high school,3.706104,3.621633,3.981173,3.628974,3.575151,3.778684
6,college1,3.866864,3.803552,4.026429,3.899598,3.796488,3.807109
7,college2,4.073992,3.844181,4.063811,3.883764,3.765704,3.903198


In [35]:
preds_df.to_csv('hsfinal.csv')

## Running SVR after TF-IDF

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
import math
from sklearn.svm import SVR

## Model inference on Balanced Test

In [23]:
train = pd.read_csv("/kaggle/input/580data/train_balanced.csv")
test = pd.read_csv("/kaggle/input/580data/test_balanced.csv")

In [24]:
full_df = np.concatenate((train.full_text.values,test.full_text.values))

In [25]:
tfidf_featurizer = TfidfVectorizer(max_features=10000, max_df=0.98, stop_words='english')
X_tfidf = tfidf_featurizer.fit_transform(full_df)

In [27]:
# SPLIT DATA
tgtCols = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
X_train, X_test, y_train, y_test = train_test_split(X_tfidf[0:len(train.full_text)], 
                                                    train[tgtCols].values,
                                                    test_size=0.10,
                                                    random_state=42)

In [28]:
print(X_train.shape)
print(X_test.shape)

(2815, 10000)
(313, 10000)


In [29]:
best_params = {'C' : 10, 
                'epsilon': 0.1, 
                'gamma' : 1, 
                'kernel' : 'rbf'} 

In [30]:
data_test = X_tfidf[len(train.full_text):]
df_sum = pd.DataFrame([],index=test.text_id,columns= tgtCols)

In [31]:
svr_clf = SVR(**best_params)
rerror = []
for k in range(0,y_train.shape[1]):
  svr_clf.fit(X_train, y_train[:,k])
  rf_preds = svr_clf.predict(X_test)
  rerror.append(mse(rf_preds,y_test[:,k]))
  MSE = np.mean(rerror)
  RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)

Root Mean Square Error:

0.6059682633613659


In [56]:
test_test = test.iloc[0:,3:9]
test_test

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,2.0,2.0,2.0,2.0,2.0,2.0
1,2.5,2.5,2.5,3.0,2.5,2.0
2,2.0,2.0,2.0,2.0,2.0,2.5
3,2.5,2.0,2.5,2.0,2.0,2.5
4,3.0,2.5,2.0,2.0,2.0,2.5
...,...,...,...,...,...,...
778,3.0,3.5,3.5,4.0,3.5,4.0
779,4.5,4.0,3.5,4.0,3.5,3.5
780,3.5,4.0,4.5,4.0,3.5,3.5
781,4.5,3.5,4.0,4.0,4.0,3.5


In [57]:
testarray = test_test.to_numpy()
testarray[0]

array([2., 2., 2., 2., 2., 2.])

In [64]:
svr_clf = SVR(**best_params)
error = []
for k in range(0,y_train.shape[1]):
  svr_clf.fit(X_train, y_train[:,k])
  rf_preds = svr_clf.predict(data_test)
  #print(rf_preds)
  df_sum[tgtCols[k]] = rf_preds
  rerror.append(mse(rf_preds,testarray[:,k]))
  MSE = np.mean(rerror)
  RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)
  #error.append(rmse(rf_preds,y_test[:,k],squared=False))

Root Mean Square Error:

0.5811373724702388


In [65]:
df_sum

Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13C400DD9794,2.853360,2.850215,2.848165,2.837042,3.005184,2.759427
D9BC7F4F22F0,2.737378,2.419024,2.698912,2.476842,2.503007,2.221925
3E170458E9A1,2.740506,2.642922,2.872641,2.793128,2.755787,2.731921
E0BFF1488787,2.786685,2.850549,2.972727,2.767457,2.806187,3.038367
C50BE3C76571,2.732530,2.602709,2.823353,2.491031,2.213038,2.485433
...,...,...,...,...,...,...
A4A90A401002,3.376377,3.209147,3.556211,3.173799,3.539506,3.412993
CA11FD3CAC43,3.719921,3.441376,3.739208,3.654375,3.216852,3.539730
0BB9FAE6E27B,3.354057,3.210798,3.567047,3.134979,3.391220,3.483420
ED0A8E614649,3.361802,3.370543,3.564252,3.569192,3.271189,3.454624


In [66]:
df_sum.to_csv('svrtest.csv')

## Model inference on Final Test

In [17]:
train = pd.read_csv("../input/feedback-prize-english-language-learning/train.csv")
test = pd.read_csv("/kaggle/input/580data/test.csv")

In [18]:
# Running for the train full_text with training all
# fit the six test as test
full_df = np.concatenate((train.full_text.values,test.full_text.values))

In [19]:
tfidf_featurizer = TfidfVectorizer(max_features=10000, max_df=0.98, stop_words='english')
X_tfidf = tfidf_featurizer.fit_transform(full_df)

In [20]:
# SPLIT DATA
X_train, X_test, y_train, y_test = train_test_split(X_tfidf[0:len(train.full_text)], 
                                                    train[tgtCols].values,
                                                    test_size=0.10,
                                                    random_state=42)

In [40]:
print(X_train.shape)
print(X_test.shape)

(3519, 10000)
(392, 10000)


In [41]:
best_params = {'C' : 10, 
                'epsilon': 0.1, 
                'gamma' : 1, 
                'kernel' : 'rbf'} 

In [42]:
data_test = X_tfidf[len(train.full_text):]
df_sum = pd.DataFrame([],index=test.text_id,columns= tgtCols)

In [44]:
svr_clf = SVR(**best_params)
rerror = []
for k in range(0,y_train.shape[1]):
  svr_clf.fit(X_train, y_train[:,k])
  rf_preds = svr_clf.predict(X_test)
  rerror.append(mse(rf_preds,y_test[:,k]))
  MSE = np.mean(rerror)
  RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)


Root Mean Square Error:

0.5588277727742706


In [45]:
svr_clf = SVR(**best_params)
error = []
for k in range(0,y_train.shape[1]):
  svr_clf.fit(X_train, y_train[:,k])
  rf_preds = svr_clf.predict(data_test)
  df_sum[tgtCols[k]] = rf_preds
  #error.append(rmse(rf_preds,y_test[:,k],squared=False))



In [46]:
df_sum

Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000C359D63E,2.907619,2.72479,3.150306,3.04753,2.636049,2.737348
000BAD50D026,3.000263,2.81182,2.974439,2.673552,2.731733,3.052494
00367BB2546B,3.354401,3.437521,3.444208,3.359888,3.31463,3.320433
hp,2.71297,2.675136,2.985553,2.855222,2.794587,2.725889
tkm,2.82263,2.799316,3.000837,2.851043,2.873569,2.749054
high school,2.928985,2.794508,3.178074,2.917375,2.868357,2.83907
college1,3.045923,3.017985,3.273694,3.076371,3.00961,2.888542
college2,3.026962,2.95194,3.25654,3.008387,2.99578,2.981578


In [47]:
df_sum.to_csv('svr.csv')