In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
import re 
import scipy
from scipy import sparse
import gc 
from IPython.display import display, HTML
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")


pd.options.display.max_colwidth=300

In [2]:
import pickle
from joblib import dump,load

In [3]:
df_val = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
df_sub = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [4]:
n_folds = 7
val_preds_arr1 = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2 = np.zeros((df_val.shape[0], n_folds))
test_preds_arr = np.zeros((df_sub.shape[0], n_folds))

In [5]:
for fld in range(n_folds):
    pipeline = load('/kaggle/input/toxiclinearregression/withclean'+str(fld)+'.joblib')
    
    # What are the important features for toxicity

    print('\nTotal number of features:', len(pipeline['features'].get_feature_names()) )

    feature_wts = sorted(list(zip(pipeline['features'].get_feature_names(), 
                                  np.round(pipeline['clf'].coef_,2) )), 
                         key = lambda x:x[1], 
                         reverse=True)

    pprint(feature_wts[:30])
    
    print("\npredict validation data ")
    val_preds_arr1[:,fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2[:,fld] = pipeline.predict(df_val['more_toxic'])
    
    print("\npredict test data ")
    test_preds_arr[:,fld] = pipeline.predict(df_sub['text'])
    


Total number of features: 75032
[('vect3__uck', 0.39),
 ('vect3__ f ', 0.36),
 ('vect3__fuc', 0.32),
 ('vect3__fuck', 0.31),
 ('vect3__shit', 0.31),
 ('vect3__ ass', 0.28),
 ('vect3__ass', 0.28),
 ('vect3__hit', 0.28),
 ('vect3__ rape', 0.27),
 ('vect3__ fu', 0.26),
 ('vect3__ nl', 0.26),
 ('vect3__nl3', 0.26),
 ('vect3__ g ', 0.24),
 ('vect3__nig', 0.24),
 ('vect3__bitch', 0.23),
 ('vect3__fag', 0.23),
 ('vect3__suck', 0.23),
 ('vect3__ fuc', 0.22),
 ('vect3__ gay', 0.22),
 ('vect3__ rap', 0.22),
 ('vect3__ shit', 0.22),
 ('vect3__rape ', 0.22),
 ('vect3__ fag', 0.21),
 ('vect3__ fuck', 0.21),
 ('vect3__bitc', 0.21),
 ('vect3__nigg', 0.21),
 ('vect3__ * ', 0.2),
 ('vect3__ as', 0.2),
 ('vect3__ nig', 0.2),
 ('vect3__ie ', 0.2)]

predict validation data 

predict test data 

Total number of features: 74635
[('vect3__uck', 0.42),
 ('vect3__fuc', 0.33),
 ('vect3__fuck', 0.3),
 ('vect3__ f ', 0.29),
 ('vect3__ fu', 0.29),
 ('vect3__ nl', 0.29),
 ('vect3__ck ', 0.27),
 ('vect3__ie ', 0.27

In [6]:
val_preds_arr1c = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2c = np.zeros((df_val.shape[0], n_folds))
test_preds_arrc = np.zeros((df_sub.shape[0], n_folds))

In [7]:
for fld in range(n_folds):
    pipeline = load('/kaggle/input/toxiclinearregression/withoutclean'+str(fld)+'.joblib')
    
    # What are the important features for toxicity

    print('\nTotal number of features:', len(pipeline['features'].get_feature_names()) )

    feature_wts = sorted(list(zip(pipeline['features'].get_feature_names(), 
                                  np.round(pipeline['clf'].coef_,2) )), 
                         key = lambda x:x[1], 
                         reverse=True)

    pprint(feature_wts[:30])
    
    print("\npredict validation data ")
    val_preds_arr1c[:,fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2c[:,fld] = pipeline.predict(df_val['more_toxic'])
    
    print("\npredict test data ")
    test_preds_arrc[:,fld] = pipeline.predict(df_sub['text'])


Total number of features: 80874
[('vect3__uck', 0.43),
 ('vect3__fuc', 0.36),
 ('vect3__fuck', 0.32),
 ('vect3__you ', 0.32),
 ('vect3__ ass', 0.31),
 ('vect3__ass', 0.3),
 ('vect3__shit', 0.3),
 ('vect3__ you ', 0.28),
 ('vect3__hit', 0.28),
 ('vect3__ fu', 0.27),
 ('vect3__ nl', 0.26),
 ('vect3__ f ', 0.25),
 ('vect3__ rape', 0.25),
 ('vect3__nl3', 0.25),
 ('vect3__ g ', 0.24),
 ('vect3__ u ', 0.24),
 ('vect3__bitch', 0.24),
 ('vect3__fag', 0.24),
 ('vect3__nig', 0.24),
 ('vect3__ fag', 0.23),
 ('vect3__ gay', 0.23),
 ('vect3__nigg', 0.23),
 ('vect3__ fuc', 0.22),
 ('vect3__ shit', 0.22),
 ('vect3__bitc', 0.22),
 ('vect3__suck', 0.22),
 ('vect3__uck ', 0.22),
 ('vect3__wwwww', 0.22),
 ('vect3__ rap', 0.21),
 ('vect3__!!!', 0.21)]

predict validation data 

predict test data 

Total number of features: 80314
[('vect3__uck', 0.46),
 ('vect3__fuc', 0.34),
 ('vect3__fuck', 0.3),
 ('vect3__ fu', 0.28),
 ('vect3__ nl', 0.28),
 ('vect3__l33', 0.28),
 ('vect3__ ass', 0.27),
 ('vect3__ die '

In [8]:
val_preds_arr1_ = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2_ = np.zeros((df_val.shape[0], n_folds))
test_preds_arr_ = np.zeros((df_sub.shape[0], n_folds))

for fld in range(n_folds):
    pipeline = load('/kaggle/input/rudditridgeregression/withruddit'+str(fld)+'.joblib')
    feature_wts = sorted(list(zip(pipeline['features'].get_feature_names(), 
                                  np.round(pipeline['clf'].coef_,2) )), 
                         key = lambda x:x[1], 
                         reverse=True)

    pprint(feature_wts[:30])
    dump(pipeline,'/kaggle/working/withruddit'+str(fld)+'.joblib')
    print("\npredict validation data ")
    val_preds_arr1_[:,fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2_[:,fld] = pipeline.predict(df_val['more_toxic'])

    print("\npredict test data ")
    test_preds_arr_[:,fld] = pipeline.predict(df_sub['text'])

[('vect3__fuc', 0.59),
 ('vect3__fuck', 0.59),
 ('vect3__uck', 0.55),
 ('vect3__ fuc', 0.52),
 ('vect3__ fuck', 0.52),
 ('vect3__ fu', 0.42),
 ('vect3__shit', 0.4),
 ('vect3__hit', 0.37),
 ('vect3__ shit', 0.36),
 ('vect3__fuck ', 0.34),
 ('vect3__ck ', 0.32),
 ('vect3__uck ', 0.31),
 ('vect3__shi', 0.29),
 ('vect3__ shi', 0.28),
 ('vect3__ ass ', 0.27),
 ('vect3__sex', 0.26),
 ('vect3__ ass', 0.25),
 ('vect3__ dick', 0.25),
 ('vect3__dick', 0.25),
 ('vect3__ dic', 0.24),
 ('vect3__ sex', 0.24),
 ('vect3__ sh', 0.23),
 ('vect3__shit ', 0.23),
 ('vect3__ass', 0.21),
 ('vect3__ di', 0.19),
 ('vect3__ du', 0.18),
 ('vect3__!!!', 0.18),
 ('vect3__dic', 0.18),
 ('vect3__ex ', 0.18),
 ('vect3__hit ', 0.18)]

predict validation data 

predict test data 
[('vect3__fuc', 0.58),
 ('vect3__fuck', 0.58),
 ('vect3__ fuc', 0.53),
 ('vect3__ fuck', 0.53),
 ('vect3__uck', 0.53),
 ('vect3__ fu', 0.41),
 ('vect3__shit', 0.38),
 ('vect3__hit', 0.36),
 ('vect3__ shit', 0.35),
 ('vect3__ck ', 0.35),
 ('vec

In [9]:
print(" Toxic data ")
p1 = val_preds_arr1.mean(axis=1)
p2 = val_preds_arr2.mean(axis=1)

print(f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}')

print(" Ruddit data ")
p3 = val_preds_arr1_.mean(axis=1)
p4 = val_preds_arr2_.mean(axis=1)

print(f'Validation Accuracy is { np.round((p3 < p4).mean() * 100,2)}')

print(" Toxic CLEAN data ")
p5 = val_preds_arr1c.mean(axis=1)
p6 = val_preds_arr2c.mean(axis=1)

print(f'Validation Accuracy is { np.round((p5 < p6).mean() * 100,2)}')

 Toxic data 
Validation Accuracy is 67.47
 Ruddit data 
Validation Accuracy is 62.57
 Toxic CLEAN data 
Validation Accuracy is 68.17


In [10]:
print("Find right weight linear")

wts_acc = []
for i in range(30,70,1):
    for j in range(0,20,1):
        w1 = i/100
        w2 = (100 - i - j)/100
        w3 = (1 - w1 - w2 )
        p1_wt = w1*p1 + w2*p3 + w3*p5
        p2_wt = w1*p2 + w2*p4 + w3*p6
        wts_acc.append( (w1,w2,w3, 
                         np.round((p1_wt < p2_wt).mean() * 100,2))
                      )
sorted(wts_acc, key=lambda x:x[3], reverse=True)[:5]

Find right weight linear


[(0.69, 0.12, 0.19000000000000006, 67.93),
 (0.68, 0.13, 0.18999999999999995, 67.92),
 (0.69, 0.13, 0.18000000000000005, 67.92),
 (0.65, 0.16, 0.18999999999999997, 67.91),
 (0.67, 0.14, 0.18999999999999995, 67.91)]

In [11]:
p1#p1代表less_toxic中的预测结果

array([0.15562062, 0.07953136, 0.13364651, ..., 0.20036902, 0.20036902,
       0.20036902])

In [12]:
w1,w2,w3,_ = sorted(wts_acc, key=lambda x:x[3], reverse=True)[0]
#print(best_wts)

p1_wt = w1*p1 + w2*p3 + w3*p5
p2_wt = w1*p2 + w2*p4 + w3*p6


In [13]:
len(df_val)

30108

In [14]:
df_val['p1'] = p1_wt
df_val['p2'] = p2_wt
df_val['diff'] = np.abs(p2_wt - p1_wt)

df_val['correct'] = (p1_wt < p2_wt).astype('int')
### Incorrect predictions with dis-similar scores
df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20)


Unnamed: 0,worker,less_toxic,more_toxic,p1,p2,diff,correct
25062,246,CaliforniaAliBaba is a Bitch\nGo fuck yourself you bitch ass motherfucker. Don't you ever in your fucking life threatening me you fucking piece of shit you. You don't know me I will fucking kill your ass you punk pussy bitch. tell your mama to my black dick.,I've heard a snippet on their myspace page but I'm afraid I haven't given them much though musically as of yet. Perhaps I will be purchasing one of their albums since I seem to spend about a third of my time on wiki reverting their vandalism.,1.056383,0.09456,0.961823,0
25754,190,what the hell man? why did you delete my page (savannah hoard) screw you dude\n\ni mean really? if you're not gonna let me have a page then you can go fuck your fat self you 40 year old bitch\n\nyour a total bitch who goes around deleting people's pages you fucking disgust me im gonna kick your ...,""" December 2009 (UTC)\n\n Now there's typical AGW alarmist Watermelon argumentum ad hominem content-vacant suppressive authoritarian WikiNazi rottenness if ever it got posted online. Don't address Monckton's (or Nsaa's) position, but strive (with utter ineffectiveness, I might add) to fault the ...",0.929921,0.169816,0.760106,0
28241,118,you are a stupid fuck \n\nand your mother's cunt stinks,"""\n\n ::: According to Stratfordian authority H.N.Gibson(frequently cited in this very article) the controversy began with John Marston and Joseph Hall in 1597 which view was even more fully affirmed by Stratfordian authorities Garnett and Gosse over fifty years previously. According to Strat...",0.804608,0.098318,0.70629,0
1952,431,ASSHOLES! COME AND TRY ME YOU STUPID FUCK SON OF A BITCHES!!!!,"""\n\nPlease, Cheesebot is not nonsense. It is a formidible and tangible thing, can you please not delete this page again? I don't believe you are the authority in human experience in a position to deem other people's political and personal movements as """"nonsense."""" I appreciate. """,0.868073,0.169549,0.698524,0
19411,46,"Ok, what is it \n\nWhy are you such a queer.. you have no power, so why the FUCK are you acting like you do. Grow up you fucking stupid shit head cock eating bastard.. seriously! Get a life, you're just a 17 yo piece of shit.",wow the guy who deleated the stuff on the Habbo NSM is leet. 88!,0.859645,0.179722,0.679924,0
21059,333,fuck you \n\nyou're an idiot,"What makes you an editor? Why do you use the name of a great dead man? What is your true name, profession, country? \nShamir",0.888629,0.213125,0.675505,0
29056,204,"Fuck You\n\nGo suck on a fat dick faggot mother fucker. I'm the the wrong person to fuck with you puck ass pussy! I would keep my moth shut, or I can shut for you!","""\n\nFUCCK U A$$HOLE do wateva u want DIICK HEAD Contact Blogsd ! """,1.002026,0.340888,0.661138,0
29057,716,"Fuck You\n\nGo suck on a fat dick faggot mother fucker. I'm the the wrong person to fuck with you puck ass pussy! I would keep my moth shut, or I can shut for you!","""\n\nFUCCK U A$$HOLE do wateva u want DIICK HEAD Contact Blogsd ! """,1.002026,0.340888,0.661138,0
10376,88,Yo Wuhwuzdat u a bitch wit no life. Yeah u heard me u a bitch. fuck u motherfucka,"""\n\n Name \n\nPaper Wrapped Cake? That is lame. That's not even its proper name! How about sponge cake? — ¡ρρ¡ ¡ρρ¡ """,0.735965,0.081028,0.654937,0
22659,386,"YOU SUCK \n\nI HATE YOU> YOU CAN FUCK MY ASS ALL NIGHT BITCH SHit\n\nLove , David Ortiz","""\n\n Antony and the Johnsons \n\nWell, Cardinal, is """"Antony's voice seems to channel Nina Simone and Bryan Ferry, and he has many celebrity admirers such as Philip Glass, Marc Almond, Lou Reed and the guest vocalists on I Am a Bird Now, Boy George, Rufus Wainwright and Devendra Banhart"""" an op...",0.815075,0.163485,0.65159,0


In [15]:
pip install ../input/pythonicforbertuse/pythonicforbert-1.0.12-py3-none-any.whl

Processing /kaggle/input/pythonicforbertuse/pythonicforbert-1.0.12-py3-none-any.whl
Installing collected packages: pythonicforbert
Successfully installed pythonicforbert-1.0.12
Note: you may need to restart the kernel to use updated packages.


In [16]:
import pythonicforbert
from pythonicforbert import get_model_function,FullTokenizer
import torch.nn as nn
from pythonicforbert import get_model_function,FullTokenizer
class ClassificationModel(nn.Module):
    def __init__(self,model,config,n_labels):
        super(ClassificationModel,self).__init__()
        self.model = model
        self.fc = nn.Linear(config.embedding_size,n_labels)
        
    def forward(self,input_ids):
        mask_ids = torch.not_equal(input_ids,1)
        #英文roberta padding=1
        output = self.model(input_ids)
        if mask_ids is not None:
            mask_ids = mask_ids[:,:,None].float()
            output -= 1e-12*(1.0-mask_ids)
        output = output[:,0]
        output = self.fc(output)
        return output

In [17]:
from torch.utils.data import Dataset,DataLoader
from transformers import RobertaTokenizer
vocab_file = '/kaggle/input/roberta-base/vocab.json'
 
merges_file = '/kaggle/input/roberta-base/merges.txt'
tokenizer = RobertaTokenizer(vocab_file, merges_file)
#tokenizer = RobertaTokenizer.from_pretrained('../input/roberta-base')
class TestDataset(Dataset):
    def __init__(self,text,maxlen):
        self.text = text
        self.maxlen = maxlen
        token_id = []
        for index in tqdm(range(len(self.text))):
            current_text = text[index]
            current_id = tokenizer(current_text)['input_ids']
            #roberta begin:0,end
            #current_id = tokenizer.convert_tokens_to_ids(current_token)
            current_id = self.sequence_padding(current_id)
            token_id.append(current_id)
        self.token_id = token_id
        
    def __len__(self):
        return len(self.token_id)

    def __getitem__(self,index):
        #return [tuple(tensor[index] for tensor in self.tensors)]
        return self.token_id[index]
    
    def sequence_padding(self,inputs,padding=1):
        #英文roberta padding=1
        length = self.maxlen
        if len(inputs) > length:
            inputs = inputs[:length-1]+[inputs[-1]]
        #保留[sep]标志部分
        outputs = []
        pad_width = (0,length-len(inputs))
        x = np.pad(inputs,pad_width,'constant',constant_values=padding)
        return x

In [18]:
import torch
from tqdm import tqdm
import numpy as np
roberta = get_model_function('roberta-base')

roberta模型与线性回归模型融合，每个roberta模型的权重在8~14之间，合在一起之后除以3

In [19]:
lesstoxic_valid_dataset = TestDataset(df_val['less_toxic'],maxlen=500)
lesstoxic_valid_loader = DataLoader(lesstoxic_valid_dataset,batch_size=16)
moretoxic_valid_dataset = TestDataset(df_val['more_toxic'],maxlen=500)
moretoxic_valid_loader = DataLoader(moretoxic_valid_dataset,batch_size=16)
test_dataset = TestDataset(df_sub['text'],maxlen=500)
test_loader = DataLoader(test_dataset,batch_size=16)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

100%|██████████| 30108/30108 [00:33<00:00, 897.30it/s]
100%|██████████| 30108/30108 [00:29<00:00, 1010.22it/s]
100%|██████████| 7537/7537 [00:07<00:00, 998.76it/s] 


In [20]:
model = torch.load('/kaggle/input/bestpointroberta/best_score0.7274436090225563split0-0.843.pth')
model = model.to(device)
model.eval()
less_toxic1,more_toxic1 = [],[]
result_toxic1 = []
for batch_token in tqdm(lesstoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        less_toxic1 = less_toxic1+current_point.cpu().numpy().tolist()

for batch_token in tqdm(moretoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        more_toxic1 = more_toxic1+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(test_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        result_toxic1 = result_toxic1+current_point.cpu().numpy().tolist()

100%|██████████| 1882/1882 [09:05<00:00,  3.45it/s]
100%|██████████| 1882/1882 [09:04<00:00,  3.45it/s]
100%|██████████| 472/472 [02:16<00:00,  3.46it/s]


In [21]:
model = torch.load('/kaggle/input/bestpointroberta/best_score0.7244756628413138seed13-0.838.pth')
model = model.to(device)
model.eval()
less_toxic2,more_toxic2 = [],[]
result_toxic2 = []
for batch_token in tqdm(lesstoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        less_toxic2 = less_toxic2+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(moretoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        more_toxic2 = more_toxic2+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(test_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        result_toxic2 = result_toxic2+current_point.cpu().numpy().tolist()

100%|██████████| 1882/1882 [09:05<00:00,  3.45it/s]
100%|██████████| 1882/1882 [09:05<00:00,  3.45it/s]
100%|██████████| 472/472 [02:16<00:00,  3.46it/s]


In [22]:
model = torch.load('/kaggle/input/bestpointroberta/best_score0.7270478828650574seed220.833.pth')
model = model.to(device)
model.eval()
less_toxic3,more_toxic3 = [],[]
result_toxic3 = []
for batch_token in tqdm(lesstoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        less_toxic3 = less_toxic3+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(moretoxic_valid_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        more_toxic3 = more_toxic3+current_point.cpu().numpy().tolist()
        
for batch_token in tqdm(test_loader):
    batch_token = batch_token.to(device)
    with torch.no_grad():
        current_point = model(batch_token)
        current_point = current_point.squeeze(-1)
        result_toxic3 = result_toxic3+current_point.cpu().numpy().tolist()

100%|██████████| 1882/1882 [09:07<00:00,  3.44it/s]
100%|██████████| 1882/1882 [09:07<00:00,  3.44it/s]
100%|██████████| 472/472 [02:17<00:00,  3.44it/s]


In [23]:
less_toxic1 = np.array(less_toxic1)
less_toxic2 = np.array(less_toxic2)
less_toxic3 = np.array(less_toxic3)

In [24]:
print("Find right weight roberta")
#w1,w2,w3,_ = sorted(wts_acc, key=lambda x:x[2], reverse=True)[0]

#p1_wt = w1*p1 + w2*p3 + w3*p5
#p2_wt = w1*p2 + w2*p4 + w3*p6
#p1_wt,p2_wt为之前保存的最好的验证集合的权重
wts_acc = []
for i in range(0,50):
    for j in range(0,50):
        for k in range(0,50):
        #右边边界应该为15
            i = i*0.1
            j = j*0.1
            k = k*0.1
            less_toxic_wt = np.array(less_toxic1).dot(i)+np.array(less_toxic2).dot(j)+np.array(less_toxic3).dot(k)
            #less_toxic_wt = i*less_toxic1 + j*less_toxic2 + k*less_toxic3
            less_toxic_wt = less_toxic_wt+p1_wt
            more_toxic_wt = np.array(more_toxic1).dot(i)+np.array(more_toxic2).dot(j)+np.array(more_toxic3).dot(k)
            more_toxic_wt = more_toxic_wt+p2_wt
            wts_acc.append((i,j,k, 
                         np.round((less_toxic_wt < more_toxic_wt).mean() * 100,2))
                      )
sorted(wts_acc, key=lambda x:x[3], reverse=True)[:5]
#新的结果不能使用w1,w2,w3，因为w1,w2,w3要作为最后的结果内容

Find right weight roberta


[(0.0, 0.0, 1.9000000000000001, 69.63),
 (0.0, 0.0, 2.4000000000000004, 69.63),
 (0.0, 1.0000000000000011e-20, 1.9000000000000001, 69.63),
 (0.0, 1.0000000000000014e-25, 2.4000000000000004, 69.63),
 (0.0, 2.0000000000000023e-20, 1.9000000000000001, 69.63)]

In [25]:
#w1,w2,w3,_ = sorted(wts_acc, key=lambda x:x[2], reverse=True)[0]
#p1_wt = w1*p1 + w2*p3 + w3*p5
#p2_wt = w1*p2 + w2*p4 + w3*p6
#前面计算出来的线性回归的w1,w2,w3是线性回归中最好的权重内容
w4,w5,w6,_ = sorted(wts_acc, key=lambda x:x[3], reverse=True)[0]
#易错点：这里的key=lambda一定为x:x[3]，去查看第三位的得分
df_val['p1'] = less_toxic_wt+p1_wt
df_val['p2'] = more_toxic_wt+p2_wt
df_val['diff'] = np.abs(p2_wt - p1_wt)     

df_val['correct'] = (p1_wt < p2_wt).astype('int')
### Incorrect predictions with dis-similar scores
df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20)
df_val.head()

Unnamed: 0,worker,less_toxic,more_toxic,p1,p2,diff,correct
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!!!!!!!!??????????????????????????????????????????????????????!!!!!!=\nWHER IS YOUR SEXY PIC GONE FROM YOUR MAIN PAGE???????? put it back,0.72971,1.034622,0.023155,1
1,188,"""And yes, people should recognize that but they usually don't. One of the first objections you hear directed at someone who says he favors free markets is invariably """"There's no such thing as a free market."""" Ridiculously trivial. ""","Daphne Guinness \n\nTop of the mornin' my favourite Alien Life Form. I'm sorry, I can't marry you because you are a poor commoner so, therefore, unimportant. Can I have my dresses back. I hope your fat Oirish arse hasn't stretched them.",0.228523,0.858688,0.101788,1
2,82,"Western Media?\n\nYup, because every crime in the entire world is caused by the west. Racists.","""Atom you don't believe actual photos of masturbation is sexually explicit photos. I am sorry to say they are sexually explicit and so you said yourself that the disclaimer is used for that purpose. Also, Wikipedia itself is targeted for minors. In addition, you say that Wikipedia does not have ...",0.501397,0.794345,0.047356,1
3,347,"And you removed it! You numbskull! I don't care what you say anymore, this is my life! Go ahead with your own life, leave me alone! —","You seem to have sand in your vagina.\n\nMight want to wash that stinking hole out, bitch.",0.82712,1.547908,0.157101,1
4,539,smelly vagina \n\nBluerasberry why don't you be a model for a cheesy blue vagina syndrome. A lot are experiencing this vaginal disease.,"hey \n\nway to support nazis, you racist",0.991378,1.110849,0.05662,1


In [26]:
print('w1 = %f,w2 = %f,w3 = %f\n'%(w1,w2,w3))
print('w4 = %d,w5 = %d,w6 = %d\n'%(w4,w5,w6))

w1 = 0.690000,w2 = 0.120000,w3 = 0.190000

w4 = 0,w5 = 0,w6 = 1



In [27]:
df_submission = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv')
df_submission['score'] = w1*test_preds_arr.mean(axis=1) + w2*test_preds_arr_.mean(axis=1) + w3*test_preds_arrc.mean(axis=1) + \
                         np.dot(np.array(result_toxic1),w4) + np.dot(np.array(result_toxic2),w5) + np.dot(np.array(result_toxic3),w6)
#df_submission['score'] = np.dot(np.array(result_toxic1),w4) + np.dot(np.array(result_toxic2),w5) + np.dot(np.array(result_toxic3),w6)
df_submission[['comment_id', 'score']].to_csv("submission.csv", index=False)