In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
import re 
import scipy
from scipy import sparse
import gc 
from IPython.display import display, HTML
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")


pd.options.display.max_colwidth=300

In [2]:
import pickle
from joblib import dump,load

In [3]:
df_val = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
df_sub = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [4]:
n_folds = 7
val_preds_arr1 = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2 = np.zeros((df_val.shape[0], n_folds))
test_preds_arr = np.zeros((df_sub.shape[0], n_folds))

In [5]:
for fld in range(n_folds):
    pipeline = load('/kaggle/input/toxiclinearregression/withclean'+str(fld)+'.joblib')
    
    # What are the important features for toxicity

    print('\nTotal number of features:', len(pipeline['features'].get_feature_names()) )

    feature_wts = sorted(list(zip(pipeline['features'].get_feature_names(), 
                                  np.round(pipeline['clf'].coef_,2) )), 
                         key = lambda x:x[1], 
                         reverse=True)

    pprint(feature_wts[:30])
    
    print("\npredict validation data ")
    val_preds_arr1[:,fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2[:,fld] = pipeline.predict(df_val['more_toxic'])
    
    print("\npredict test data ")
    test_preds_arr[:,fld] = pipeline.predict(df_sub['text'])
    


Total number of features: 75032
[('vect3__uck', 0.39),
 ('vect3__ f ', 0.36),
 ('vect3__fuc', 0.32),
 ('vect3__fuck', 0.31),
 ('vect3__shit', 0.31),
 ('vect3__ ass', 0.28),
 ('vect3__ass', 0.28),
 ('vect3__hit', 0.28),
 ('vect3__ rape', 0.27),
 ('vect3__ fu', 0.26),
 ('vect3__ nl', 0.26),
 ('vect3__nl3', 0.26),
 ('vect3__ g ', 0.24),
 ('vect3__nig', 0.24),
 ('vect3__bitch', 0.23),
 ('vect3__fag', 0.23),
 ('vect3__suck', 0.23),
 ('vect3__ fuc', 0.22),
 ('vect3__ gay', 0.22),
 ('vect3__ rap', 0.22),
 ('vect3__ shit', 0.22),
 ('vect3__rape ', 0.22),
 ('vect3__ fag', 0.21),
 ('vect3__ fuck', 0.21),
 ('vect3__bitc', 0.21),
 ('vect3__nigg', 0.21),
 ('vect3__ * ', 0.2),
 ('vect3__ as', 0.2),
 ('vect3__ nig', 0.2),
 ('vect3__ie ', 0.2)]

predict validation data 

predict test data 

Total number of features: 74635
[('vect3__uck', 0.42),
 ('vect3__fuc', 0.33),
 ('vect3__fuck', 0.3),
 ('vect3__ f ', 0.29),
 ('vect3__ fu', 0.29),
 ('vect3__ nl', 0.29),
 ('vect3__ck ', 0.27),
 ('vect3__ie ', 0.27

In [6]:
val_preds_arr1c = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2c = np.zeros((df_val.shape[0], n_folds))
test_preds_arrc = np.zeros((df_sub.shape[0], n_folds))

In [7]:
for fld in range(n_folds):
    pipeline = load('/kaggle/input/toxiclinearregression/withoutclean'+str(fld)+'.joblib')
    
    # What are the important features for toxicity

    print('\nTotal number of features:', len(pipeline['features'].get_feature_names()) )

    feature_wts = sorted(list(zip(pipeline['features'].get_feature_names(), 
                                  np.round(pipeline['clf'].coef_,2) )), 
                         key = lambda x:x[1], 
                         reverse=True)

    pprint(feature_wts[:30])
    
    print("\npredict validation data ")
    val_preds_arr1c[:,fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2c[:,fld] = pipeline.predict(df_val['more_toxic'])
    
    print("\npredict test data ")
    test_preds_arrc[:,fld] = pipeline.predict(df_sub['text'])


Total number of features: 80874
[('vect3__uck', 0.43),
 ('vect3__fuc', 0.36),
 ('vect3__fuck', 0.32),
 ('vect3__you ', 0.32),
 ('vect3__ ass', 0.31),
 ('vect3__ass', 0.3),
 ('vect3__shit', 0.3),
 ('vect3__ you ', 0.28),
 ('vect3__hit', 0.28),
 ('vect3__ fu', 0.27),
 ('vect3__ nl', 0.26),
 ('vect3__ f ', 0.25),
 ('vect3__ rape', 0.25),
 ('vect3__nl3', 0.25),
 ('vect3__ g ', 0.24),
 ('vect3__ u ', 0.24),
 ('vect3__bitch', 0.24),
 ('vect3__fag', 0.24),
 ('vect3__nig', 0.24),
 ('vect3__ fag', 0.23),
 ('vect3__ gay', 0.23),
 ('vect3__nigg', 0.23),
 ('vect3__ fuc', 0.22),
 ('vect3__ shit', 0.22),
 ('vect3__bitc', 0.22),
 ('vect3__suck', 0.22),
 ('vect3__uck ', 0.22),
 ('vect3__wwwww', 0.22),
 ('vect3__ rap', 0.21),
 ('vect3__!!!', 0.21)]

predict validation data 

predict test data 

Total number of features: 80314
[('vect3__uck', 0.46),
 ('vect3__fuc', 0.34),
 ('vect3__fuck', 0.3),
 ('vect3__ fu', 0.28),
 ('vect3__ nl', 0.28),
 ('vect3__l33', 0.28),
 ('vect3__ ass', 0.27),
 ('vect3__ die '

In [8]:
val_preds_arr1_ = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2_ = np.zeros((df_val.shape[0], n_folds))
test_preds_arr_ = np.zeros((df_sub.shape[0], n_folds))

for fld in range(n_folds):
    pipeline = load('/kaggle/input/rudditridgeregression/withruddit'+str(fld)+'.joblib')
    feature_wts = sorted(list(zip(pipeline['features'].get_feature_names(), 
                                  np.round(pipeline['clf'].coef_,2) )), 
                         key = lambda x:x[1], 
                         reverse=True)

    pprint(feature_wts[:30])
    dump(pipeline,'/kaggle/working/withruddit'+str(fld)+'.joblib')
    print("\npredict validation data ")
    val_preds_arr1_[:,fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2_[:,fld] = pipeline.predict(df_val['more_toxic'])

    print("\npredict test data ")
    test_preds_arr_[:,fld] = pipeline.predict(df_sub['text'])

[('vect3__fuc', 0.59),
 ('vect3__fuck', 0.59),
 ('vect3__uck', 0.55),
 ('vect3__ fuc', 0.52),
 ('vect3__ fuck', 0.52),
 ('vect3__ fu', 0.42),
 ('vect3__shit', 0.4),
 ('vect3__hit', 0.37),
 ('vect3__ shit', 0.36),
 ('vect3__fuck ', 0.34),
 ('vect3__ck ', 0.32),
 ('vect3__uck ', 0.31),
 ('vect3__shi', 0.29),
 ('vect3__ shi', 0.28),
 ('vect3__ ass ', 0.27),
 ('vect3__sex', 0.26),
 ('vect3__ ass', 0.25),
 ('vect3__ dick', 0.25),
 ('vect3__dick', 0.25),
 ('vect3__ dic', 0.24),
 ('vect3__ sex', 0.24),
 ('vect3__ sh', 0.23),
 ('vect3__shit ', 0.23),
 ('vect3__ass', 0.21),
 ('vect3__ di', 0.19),
 ('vect3__ du', 0.18),
 ('vect3__!!!', 0.18),
 ('vect3__dic', 0.18),
 ('vect3__ex ', 0.18),
 ('vect3__hit ', 0.18)]

predict validation data 

predict test data 
[('vect3__fuc', 0.58),
 ('vect3__fuck', 0.58),
 ('vect3__ fuc', 0.53),
 ('vect3__ fuck', 0.53),
 ('vect3__uck', 0.53),
 ('vect3__ fu', 0.41),
 ('vect3__shit', 0.38),
 ('vect3__hit', 0.36),
 ('vect3__ shit', 0.35),
 ('vect3__ck ', 0.35),
 ('vec

In [9]:
print(" Toxic data ")
p1 = val_preds_arr1.mean(axis=1)
p2 = val_preds_arr2.mean(axis=1)

print(f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}')

print(" Ruddit data ")
p3 = val_preds_arr1_.mean(axis=1)
p4 = val_preds_arr2_.mean(axis=1)

print(f'Validation Accuracy is { np.round((p3 < p4).mean() * 100,2)}')

print(" Toxic CLEAN data ")
p5 = val_preds_arr1c.mean(axis=1)
p6 = val_preds_arr2c.mean(axis=1)

print(f'Validation Accuracy is { np.round((p5 < p6).mean() * 100,2)}')

 Toxic data 
Validation Accuracy is 67.47
 Ruddit data 
Validation Accuracy is 62.57
 Toxic CLEAN data 
Validation Accuracy is 68.17


In [10]:
print("Find right weight")

wts_acc = []
for i in range(30,70,1):
    for j in range(0,20,1):
        w1 = i/100
        w2 = (100 - i - j)/100
        w3 = (1 - w1 - w2 )
        p1_wt = w1*p1 + w2*p3 + w3*p5
        p2_wt = w1*p2 + w2*p4 + w3*p6
        wts_acc.append( (w1,w2,w3, 
                         np.round((p1_wt < p2_wt).mean() * 100,2))
                      )
sorted(wts_acc, key=lambda x:x[3], reverse=True)[:5]

Find right weight


[(0.69, 0.12, 0.19000000000000006, 67.93),
 (0.68, 0.13, 0.18999999999999995, 67.92),
 (0.69, 0.13, 0.18000000000000005, 67.92),
 (0.65, 0.16, 0.18999999999999997, 67.91),
 (0.67, 0.14, 0.18999999999999995, 67.91)]

In [11]:


w1,w2,w3,_ = sorted(wts_acc, key=lambda x:x[3], reverse=True)[0]
#print(best_wts)

p1_wt = w1*p1 + w2*p3 + w3*p5
p2_wt = w1*p2 + w2*p4 + w3*p6


In [12]:
df_val['p1'] = p1_wt
df_val['p2'] = p2_wt
df_val['diff'] = np.abs(p2_wt - p1_wt)

df_val['correct'] = (p1_wt < p2_wt).astype('int')
### Incorrect predictions with dis-similar scores
df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20)


Unnamed: 0,worker,less_toxic,more_toxic,p1,p2,diff,correct
25062,246,CaliforniaAliBaba is a Bitch\nGo fuck yourself you bitch ass motherfucker. Don't you ever in your fucking life threatening me you fucking piece of shit you. You don't know me I will fucking kill your ass you punk pussy bitch. tell your mama to my black dick.,I've heard a snippet on their myspace page but I'm afraid I haven't given them much though musically as of yet. Perhaps I will be purchasing one of their albums since I seem to spend about a third of my time on wiki reverting their vandalism.,1.174676,0.189535,0.98514,0
25754,190,what the hell man? why did you delete my page (savannah hoard) screw you dude\n\ni mean really? if you're not gonna let me have a page then you can go fuck your fat self you 40 year old bitch\n\nyour a total bitch who goes around deleting people's pages you fucking disgust me im gonna kick your ...,""" December 2009 (UTC)\n\n Now there's typical AGW alarmist Watermelon argumentum ad hominem content-vacant suppressive authoritarian WikiNazi rottenness if ever it got posted online. Don't address Monckton's (or Nsaa's) position, but strive (with utter ineffectiveness, I might add) to fault the ...",1.002434,0.267199,0.735235,0
19411,46,"Ok, what is it \n\nWhy are you such a queer.. you have no power, so why the FUCK are you acting like you do. Grow up you fucking stupid shit head cock eating bastard.. seriously! Get a life, you're just a 17 yo piece of shit.",wow the guy who deleated the stuff on the Habbo NSM is leet. 88!,0.95865,0.251949,0.706701,0
29056,204,"Fuck You\n\nGo suck on a fat dick faggot mother fucker. I'm the the wrong person to fuck with you puck ass pussy! I would keep my moth shut, or I can shut for you!","""\n\nFUCCK U A$$HOLE do wateva u want DIICK HEAD Contact Blogsd ! """,1.117289,0.412428,0.70486,0
29057,716,"Fuck You\n\nGo suck on a fat dick faggot mother fucker. I'm the the wrong person to fuck with you puck ass pussy! I would keep my moth shut, or I can shut for you!","""\n\nFUCCK U A$$HOLE do wateva u want DIICK HEAD Contact Blogsd ! """,1.117289,0.412428,0.70486,0
17175,179,"Gon Be A Snitch \n\nYou are a little bitch. I fuckin spent 6 hours on big sam (rapper) an you call it fuckin vandolism, hard ass aint you, fuck you i'm makin a true ass page an you wack off an delete my page. What the fuck is up, your like a snitch but to yourself. I'll fuckin edit your bitch a...",INDIAN ANUS IS STRETCHED AND WIDENED BY GRAWP’S MASSIVE COCK.,1.023101,0.332049,0.691051,0
17193,238,fuck this this is bull shit and it does not even say where shit comes from your fucking ass,Just To Let You Know. You have no life.\nI hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I ...,1.010876,0.332671,0.678205,0
17192,347,fuck this this is bull shit and it does not even say where shit comes from your fucking ass,Just To Let You Know. You have no life.\nI hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I hate you.I ...,1.010876,0.332671,0.678205,0
7788,580,suck my dick \n\nfuck you dirty whore bitch,Stop being such a sissy.,0.99858,0.330428,0.668152,0
21059,333,fuck you \n\nyou're an idiot,"What makes you an editor? Why do you use the name of a great dead man? What is your true name, profession, country? \nShamir",0.95656,0.290622,0.665938,0


In [13]:
df_submission = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv')
df_submission['score'] = w1*test_preds_arr.mean(axis=1) + w2*test_preds_arr_.mean(axis=1) + w3*test_preds_arrc.mean(axis=1)
df_submission[['comment_id', 'score']].to_csv("submission.csv", index=False)