In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
import re 
import scipy
from scipy import sparse
import gc 
from IPython.display import display, HTML
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")


pd.options.display.max_colwidth=300

In [2]:
import pickle
from joblib import dump,load

In [3]:
#df_val = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
df_val = pd.read_csv('/home/xiaoguzai/数据/Kaggle Jigsaw Rate Severity of Toxic Comments/validation_true.csv')
df_sub = pd.read_csv("/home/xiaoguzai/数据/Kaggle Jigsaw Rate Severity of Toxic Comments/comments_to_score.csv")

In [4]:
n_folds = 7
val_preds_arr1 = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2 = np.zeros((df_val.shape[0], n_folds))
test_preds_arr = np.zeros((df_sub.shape[0], n_folds))

In [5]:
import pythonicforbert
from pythonicforbert import get_model_function,FullTokenizer
import torch.nn as nn
from pythonicforbert import get_model_function,FullTokenizer
class ClassificationModel(nn.Module):
    def __init__(self,model,config,n_labels):
        super(ClassificationModel,self).__init__()
        self.model = model
        self.fc = nn.Linear(config.embedding_size,n_labels)
        
    def forward(self,input_ids):
        mask_ids = torch.not_equal(input_ids,1)
        #英文roberta padding=1
        output = self.model(input_ids)
        if mask_ids is not None:
            mask_ids = mask_ids[:,:,None].float()
            output -= 1e-12*(1.0-mask_ids)
        output = output[:,0]
        output = self.fc(output)
        return output

In [6]:
from torch.utils.data import Dataset,DataLoader
from transformers import RobertaTokenizer
vocab_file = '/home/xiaoguzai/模型/roberta-english-base/vocab.json'
 
merges_file = '/home/xiaoguzai/模型/roberta-english-base/merges.txt'
tokenizer = RobertaTokenizer(vocab_file, merges_file)
#tokenizer = RobertaTokenizer.from_pretrained('../input/roberta-base')
class TestDataset(Dataset):
    def __init__(self,text,maxlen):
        self.text = text
        self.maxlen = maxlen
        token_id = []
        for index in tqdm(range(len(self.text))):
            current_text = text[index]
            current_id = tokenizer(current_text)['input_ids']
            #roberta begin:0,end
            #current_id = tokenizer.convert_tokens_to_ids(current_token)
            current_id = self.sequence_padding(current_id)
            token_id.append(current_id)
        self.token_id = token_id
        
    def __len__(self):
        return len(self.token_id)

    def __getitem__(self,index):
        #return [tuple(tensor[index] for tensor in self.tensors)]
        return self.token_id[index]
    
    def sequence_padding(self,inputs,padding=1):
        #英文roberta padding=1
        length = self.maxlen
        if len(inputs) > length:
            inputs = inputs[:length-1]+[inputs[-1]]
        #保留[sep]标志部分
        outputs = []
        pad_width = (0,length-len(inputs))
        x = np.pad(inputs,pad_width,'constant',constant_values=padding)
        return x

In [7]:
import torch
from tqdm import tqdm
import numpy as np
roberta = get_model_function('roberta-base')

In [8]:
lesstoxic_valid_dataset = TestDataset(df_val['less_toxic'],maxlen=500)
lesstoxic_valid_loader = DataLoader(lesstoxic_valid_dataset,batch_size=16)
moretoxic_valid_dataset = TestDataset(df_val['more_toxic'],maxlen=500)
moretoxic_valid_loader = DataLoader(moretoxic_valid_dataset,batch_size=16)
test_dataset = TestDataset(df_sub['text'],maxlen=500)
test_loader = DataLoader(test_dataset,batch_size=16)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

100%|█████████████████████████████████████| 4806/4806 [00:02<00:00, 2228.40it/s]
100%|█████████████████████████████████████| 4806/4806 [00:01<00:00, 2730.15it/s]
100%|█████████████████████████████████████| 7537/7537 [00:02<00:00, 2697.09it/s]


In [9]:
model = torch.load('/home/xiaoguzai/程序/kaggle有毒评论对比比赛/best_score=0.8418643362463587seed=15.pth')
model = model.to(device)
model.eval()
less_toxic1,more_toxic1 = [],[]
result_toxic1 = []
for batch_token in tqdm(lesstoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        less_toxic1 = less_toxic1+current_point.cpu().numpy().tolist()

for batch_token in tqdm(moretoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        more_toxic1 = more_toxic1+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(test_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        result_toxic1 = result_toxic1+current_point.cpu().numpy().tolist()

100%|█████████████████████████████████████████| 301/301 [00:27<00:00, 10.75it/s]
100%|█████████████████████████████████████████| 301/301 [00:28<00:00, 10.74it/s]
100%|█████████████████████████████████████████| 472/472 [00:43<00:00, 10.74it/s]


In [10]:
model = torch.load('/home/xiaoguzai/程序/kaggle有毒评论对比比赛/best_score=0.8416562630045776seed=4.pth')
model = model.to(device)
model.eval()
less_toxic2,more_toxic2 = [],[]
result_toxic2 = []
for batch_token in tqdm(lesstoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        less_toxic2 = less_toxic2+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(moretoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        more_toxic2 = more_toxic2+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(test_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        result_toxic2 = result_toxic2+current_point.cpu().numpy().tolist()

100%|█████████████████████████████████████████| 301/301 [00:27<00:00, 10.81it/s]
100%|█████████████████████████████████████████| 301/301 [00:27<00:00, 10.82it/s]
100%|█████████████████████████████████████████| 472/472 [00:43<00:00, 10.78it/s]


In [11]:
model = torch.load('/home/xiaoguzai/程序/kaggle有毒评论对比比赛/best_score=0.8410320432792343seed=5.pth')
model = model.to(device)
model.eval()
less_toxic3,more_toxic3 = [],[]
result_toxic3 = []
for batch_token in tqdm(lesstoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        less_toxic3 = less_toxic3+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(moretoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        more_toxic3 = more_toxic3+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(test_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        result_toxic3 = result_toxic3+current_point.cpu().numpy().tolist()

100%|█████████████████████████████████████████| 301/301 [00:27<00:00, 10.79it/s]
100%|█████████████████████████████████████████| 301/301 [00:27<00:00, 10.78it/s]
100%|█████████████████████████████████████████| 472/472 [00:43<00:00, 10.78it/s]


In [12]:
model = torch.load('/home/xiaoguzai/程序/kaggle有毒评论对比比赛/best_score=0.8401997503121099seed=1.pth')
model = model.to(device)
model.eval()
less_toxic4,more_toxic4 = [],[]
result_toxic4 = []
for batch_token in tqdm(lesstoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        less_toxic4 = less_toxic4+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(moretoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        more_toxic4 = more_toxic4+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(test_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        result_toxic4 = result_toxic4+current_point.cpu().numpy().tolist()

100%|█████████████████████████████████████████| 301/301 [00:27<00:00, 10.82it/s]
100%|█████████████████████████████████████████| 301/301 [00:28<00:00, 10.62it/s]
100%|█████████████████████████████████████████| 472/472 [00:45<00:00, 10.46it/s]


In [13]:
model = torch.load('/home/xiaoguzai/程序/kaggle有毒评论对比比赛/best_score=0.8395755305867666seed=8.pth')
model = model.to(device)
model.eval()
less_toxic5,more_toxic5 = [],[]
result_toxic5 = []
for batch_token in tqdm(lesstoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        less_toxic5 = less_toxic5+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(moretoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        more_toxic5 = more_toxic5+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(test_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        result_toxic5 = result_toxic5+current_point.cpu().numpy().tolist()

100%|█████████████████████████████████████████| 301/301 [00:28<00:00, 10.66it/s]
100%|█████████████████████████████████████████| 301/301 [00:28<00:00, 10.64it/s]
100%|█████████████████████████████████████████| 472/472 [00:43<00:00, 10.81it/s]


In [14]:
prin

NameError: name 'prin' is not defined

In [19]:
import time
import multiprocessing
def calculate_data(i,j):
    for k in range(0,31):
        for u in range(0,31):
            v = 100-i-j-k-u
            less_toxic_wt = np.array(less_toxic1).dot(i)+np.array(less_toxic2).dot(j)+np.array(less_toxic3).dot(k)+\
                            np.array(less_toxic4).dot(u)+np.array(less_toxic5).dot(v)
            #less_toxic_wt = i*less_toxic1 + j*less_toxic2 + k*less_toxic3
            more_toxic_wt = np.array(more_toxic1).dot(i)+np.array(more_toxic2).dot(j)+np.array(more_toxic3).dot(k)+\
                            np.array(more_toxic4).dot(u)+np.array(more_toxic5).dot(v)
            wts_acc.append((i,j,k,u,v,
                        np.round((less_toxic_wt < more_toxic_wt).mean() * 100,2))
                          )

In [20]:
wts_acc = multiprocessing.Manager().list()

#multiprocessing.Value,进程间共享这个list
#multiprocessing.Manager().dict()/list(range(5))

start_time = time.time()
for i in tqdm(range(0,31)):
    for j in range(0,31):
        #pool.apply_async(calculate_data,(i,j,))
        p = multiprocessing.Process(target=calculate_data,args=(i,j,))
        p.start()
        p.join()

sorted(wts_acc, key=lambda x:x[5], reverse=True)[:5]
#新的结果不能使用w1,w2,w3，因为w1,w2,w3要作为最后的结果内容
print('%d second'%(time.time()-start_time))

100%|███████████████████████████████████████████| 31/31 [24:40<00:00, 47.75s/it]


1502 second


In [22]:
wts_acc = list(wts_acc)
print(wts_acc)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [24]:
w1,w2,w3,w4,w5,_ = sorted(wts_acc, key=lambda x:x[5], reverse=True)[0]

In [25]:
print('w1 = %f,w2 = %f,w3 = %f,w4 = %f,w5 = %f\n'%(w1,w2,w3,w4,w5))

w1 = 24.000000,w2 = 12.000000,w3 = 30.000000,w4 = 2.000000,w5 = 32.000000



In [26]:
sorted(wts_acc, key=lambda x:x[5], reverse=True)[:5]

[(24, 12, 30, 2, 32, 85.16),
 (21, 17, 30, 2, 30, 85.14),
 (22, 14, 30, 1, 33, 85.14),
 (24, 13, 30, 1, 32, 85.14),
 (24, 13, 30, 2, 31, 85.14)]