# TF-IDF Regression 

## Reference
- https://www.kaggle.com/leolu1998/jigsaw-ensemble-tfidf-bert#Create-Sklearn-Pipeline-with

## 1. Load dataset

In [32]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import set_config
import pickle

In [3]:
# Load the training set; the model below is fitted on its `comment` and `score` columns.
df = pd.read_csv("../data/train.csv")
# Fill missing values per column instead of a blanket `fillna(0.0)`: a missing
# comment must stay a string (the TF-IDF vectorizer cannot process the float 0.0),
# while a missing score is temporarily treated as 0.0.
df = df.fillna({"comment": "", "score": 0.0})
print(df.shape)
df.sample(5)

(1778813, 2)


Unnamed: 0,comment,score
405142,It seems just a little racist for Liberals to ...,0.159615
983968,"I hope you paid for it before showing your ""de...",0.19163
1315878,Jill Stein wants to cut the military budget by...,0.0
383462,"True, but either way the utility committees ar...",0.0
1162008,"Oh, I don't disagree with your assessment. Int...",0.0


In [4]:
# Frequency of each distinct target value, most common first.
df["score"].value_counts(sort=True, ascending=False)

0.000000    1262685
0.132936     139877
0.159615      76845
0.239652      28837
0.666667      16405
             ...   
0.201899          1
0.254629          1
0.272608          1
0.408618          1
0.460053          1
Name: score, Length: 14089, dtype: int64

### Load Validation and Test data

In [7]:
# df_val: validation pairs (`less_toxic` / `more_toxic` columns are scored later).
# df_sub: test comments to score (`text` is scored, `comment_id` goes into the submission).
df_val, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/4th/validation_data.csv",
        "../data/4th/comments_to_score.csv",
    )
)

## 2. TF-IDF & regression
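- TF-IDF - Uses the `char_wb` analyzer (character n-grams built only from text inside word boundaries) so that sub-word patterns are captured well.
- Ridge - Linear regression with L2 regularization, a simple model whose penalty term helps reduce overfitting.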

In [18]:
# Single-column prediction buffers, filled in by the validation cell:
# one row per validation pair (less/more toxic side) and one row per test comment.
val_preds_arr1 = np.zeros((len(df_val), 1))
val_preds_arr2 = np.zeros((len(df_val), 1))
test_preds_arr = np.zeros((len(df_sub), 1))

# TF-IDF over character n-grams of length 3-5 taken inside word boundaries.
# min_df=3 drops n-grams seen in fewer than 3 comments; max_df=0.5 drops those
# present in more than half of the comments.
features = FeatureUnion(
    [
        (
            "vect1",
            TfidfVectorizer(
                analyzer='char_wb',
                ngram_range=(3, 5),
                min_df=3,
                max_df=0.5,
            ),
        ),
    ]
)

# Vectorize the text, then regress the score with Ridge (default settings).
pipeline = Pipeline([("features", features), ("clf", Ridge())])

# Render estimators as an HTML diagram when displayed in the notebook.
set_config(display="diagram")

# Fit on the full training set; the fitted pipeline is the cell's displayed output.
pipeline.fit(df['comment'], df['score'])

In [34]:
# Serialize the fitted pipeline to disk.
with open('tfidf_ridge.pkl', mode='wb') as model_file:
    pickle.dump(pipeline, model_file)

## 3. Validation
- final validation and submission

In [19]:
# Inspect the fitted model, then score the validation pairs and the test comments.

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out` and fall back only on older releases. The names
# are resolved once and reused, since the vocabulary is large.
feature_union = pipeline['features']
if hasattr(feature_union, 'get_feature_names_out'):
    feature_names = list(feature_union.get_feature_names_out())
else:
    feature_names = feature_union.get_feature_names()

# What are the important features for toxicity
print('\nTotal number of features:', len(feature_names))

# Pair each feature with its Ridge coefficient (rounded to 2 decimals) and
# order from the largest coefficient downwards.
feature_wts = sorted(
    zip(feature_names, np.round(pipeline['clf'].coef_, 2)),
    key=lambda x: x[1],
    reverse=True,
)
print(feature_wts[:30])

# Score both sides of every validation pair into the single-column buffers.
print("\npredict validation data ")
val_preds_arr1[:, 0] = pipeline.predict(df_val['less_toxic'])
val_preds_arr2[:, 0] = pipeline.predict(df_val['more_toxic'])

# Score the comments that go into the submission.
print("\npredict test data ")
test_preds_arr[:, 0] = pipeline.predict(df_sub['text'])




Total number of features: 873807
[('vect1__penis', 0.8), ('vect1__fuck', 0.78), ('vect1__shit', 0.78), ('vect1__fag', 0.71), ('vect1__ crap', 0.7), ('vect1__fuc', 0.69), ('vect1__ ass.', 0.68), ('vect1__balls', 0.66), ('vect1__ sh ', 0.64), ('vect1__oolis', 0.64), ('vect1__ ass ', 0.62), ('vect1__sticl', 0.62), ('vect1__ ass', 0.61), ('vect1__ ass,', 0.6), ('vect1__fools', 0.58), ('vect1__ jerk', 0.57), ('vect1__darn', 0.57), ('vect1__gay', 0.57), ('vect1__ liar', 0.56), ('vect1__silly', 0.56), ('vect1__ fool', 0.54), ('vect1__fool', 0.54), ('vect1__anal ', 0.53), ('vect1__bitch', 0.53), ('vect1__diot', 0.53), ('vect1__prick', 0.53), ('vect1__ anus', 0.52), ('vect1__ clow', 0.52), ('vect1__clown', 0.52), ('vect1__dick', 0.52)]

predict validation data 

predict test data 


In [20]:
print(" Toxic data ")
# Row-wise mean over the prediction columns (each buffer holds a single column).
p1 = np.mean(val_preds_arr1, axis=1)
p2 = np.mean(val_preds_arr2, axis=1)

# A pair is counted as correct only when the `less_toxic` text scores strictly
# below the `more_toxic` text; the share of such pairs is reported in percent.
print(f'Validation Accuracy is {np.round(np.mean(p1 < p2) * 100, 2)}')

 Toxic data 
Validation Accuracy is 68.62


### Analyze bad predictions

In [23]:
# Attach the two predictions to the validation frame, then derive from them the
# absolute score gap and a 0/1 flag marking pairs ranked in the expected order.
df_val['p1'] = p1
df_val['p2'] = p2
df_val['diff'] = (df_val['p2'] - df_val['p1']).abs()
df_val['correct'] = (df_val['p1'] < df_val['p2']).astype('int')

In [24]:
# Misranked pairs whose two scores are closest together (smallest gap first).
df_val.loc[df_val['correct'] == 0].sort_values(by='diff').head(20)

Unnamed: 0,worker,less_toxic,more_toxic,correct,p1,p2,diff
24702,204,This page is so conventional. It's sources are...,"""\n\nA more important point about Vuia is that...",0,0.126276,0.126274,2e-06
9704,605,Well... D I cant say I will join in on the I w...,Sexism \n\nI've tried my hardest to explain t...,0,0.040171,0.040115,5.6e-05
9703,515,Well... D I cant say I will join in on the I w...,Sexism \n\nI've tried my hardest to explain t...,0,0.040171,0.040115,5.6e-05
4571,220,"""\nI would like to agree with you, and maybe I...",THEIR GOING TO BATH \n\nBUT THEY'RE A MUSLIM ...,0,0.150288,0.150198,9e-05
4569,387,"""\nI would like to agree with you, and maybe I...",THEIR GOING TO BATH \n\nBUT THEY'RE A MUSLIM ...,0,0.150288,0.150198,9e-05
4570,441,"""\nI would like to agree with you, and maybe I...",THEIR GOING TO BATH \n\nBUT THEY'RE A MUSLIM ...,0,0.150288,0.150198,9e-05
26873,451,JLaTondre is a gay boy who likes to suck dick...,Thank You your the Wellcome\nARABIC ASSHOLR GO...,0,0.36339,0.363255,0.000136
1934,216,"""\n\nMorrison's """"Black President"""" Phrase Was...",Hey RGT: you're not just simply disagreeing. ...,0,0.129691,0.129503,0.000188
1935,168,"""\n\nMorrison's """"Black President"""" Phrase Was...",Hey RGT: you're not just simply disagreeing. ...,0,0.129691,0.129503,0.000188
1936,180,"""\n\nMorrison's """"Black President"""" Phrase Was...",Hey RGT: you're not just simply disagreeing. ...,0,0.129691,0.129503,0.000188


In [25]:
# Misranked pairs whose two scores are furthest apart (largest gap first).
df_val.loc[df_val['correct'] == 0].sort_values(by='diff', ascending=False).head(20)

Unnamed: 0,worker,less_toxic,more_toxic,correct,p1,p2,diff
25062,246,CaliforniaAliBaba is a Bitch\nGo fuck yourself...,I've heard a snippet on their myspace page but...,0,1.275426,-0.001895,1.277321
28241,118,you are a stupid fuck \n\nand your mother's c...,"""\n\n ::: According to Stratfordian authori...",0,1.184728,0.049904,1.134825
1952,431,ASSHOLES! COME AND TRY ME YOU STUPID FUCK SON ...,"""\n\nPlease, Cheesebot is not nonsense. It is ...",0,1.215474,0.11265,1.102823
19411,46,"Ok, what is it \n\nWhy are you such a queer.....",wow the guy who deleated the stuff on the Habb...,0,1.161545,0.069165,1.092379
21059,333,fuck you \n\nyou're an idiot,What makes you an editor? Why do you use the n...,0,1.125096,0.070252,1.054844
18002,634,"Are you lost, stupid fucking idiots","Appeal \n\n{{unblock|G'day everyone. First, I...",0,1.124004,0.07578,1.048224
7788,580,suck my dick \n\nfuck you dirty whore bitch,Stop being such a sissy.,0,1.362443,0.31921,1.043232
11226,320,I fucked your dad last night you fucking whor...,Please don't try and mark a page for deletion ...,0,1.071216,0.067594,1.003623
20191,504,cunt. I hate your face. You are a fucking shit...,"""\n\nAll my uploaded pdf-documents have been u...",0,1.152207,0.158813,0.993394
25754,190,what the hell man? why did you delete my page ...,""" December 2009 (UTC)\n\n Now there's typical ...",0,1.186038,0.199026,0.987012


## 4. Predict on test data

In [26]:
# Collapse the single-column test prediction buffer into one score per comment
# (the predictions themselves were computed in the validation cell).
df_sub['score'] = np.mean(test_preds_arr, axis=1)

In [27]:
# How many scores are not unique: non-null count minus number of distinct values.
df_sub['score'].notna().sum() - df_sub['score'].nunique()

12

In [28]:
# The ten most frequent score values with their occurrence counts.
# Column names are pinned explicitly ('index' = score value, 'score' = count):
# a bare `value_counts().reset_index()` names them differently on pandas >= 2.0
# ('score' / 'count'), which would break the later `same_score['index']` lookup.
same_score = (
    df_sub['score']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='score')
    .head(10)
)
same_score

Unnamed: 0,index,score
0,0.572927,2
1,0.464975,2
2,0.303382,2
3,0.022798,2
4,0.230748,2
5,0.130651,2
6,0.0645,2
7,0.124117,2
8,0.230264,2
9,0.14535,2


In [29]:
# Submission rows whose score equals one of the values listed in `same_score`.
df_sub.loc[
    df_sub['score'].isin(same_score['index'].tolist())
]

Unnamed: 0,comment_id,text,score
1832,95080362,"""\n\nPlease do not add nonsense to Wikipedia. ...",0.022798
2842,160935265,"""\n\nPlease do not add nonsense to Wikipedia. ...",0.022798
4832,275797183,Hi\n\nCould you please learn to interact like ...,0.0645
4833,275812977,Could you please learn to interact like a sent...,0.0645
5140,298854514,"her!\n\nPoop, pee, toot, fart, gas, diareah!\n...",0.464975
5190,301925517,"her!\n\nPoop, pee, toot, fart, gas, diareah!\n...",0.464975
5752,339478276,I'm gonna beat you to a bloody pulp then sho...,0.230748
5753,339478966,I'm gonna beat you to a bloody pulp then shoo...,0.230748
5832,345043812,JIMBO SAID I COULD EDIT HIS PAGE. YOU ARE A MO...,0.303382
5833,345043888,JIMBO SAID I COULD EDIT HIS PAGE. YOU ARE A M...,0.303382


In [30]:
# Spot-check five random rows of the scored test set.
df_sub.sample(n=5)

Unnamed: 0,comment_id,text,score
3294,186197494,"""\nFor copying and pasting of what I felt stro...",0.141626
2167,116257386,Dude! \nThat was an attempt at saying somethi...,0.160497
7070,457417171,You simply display your ignorance. Fatuorum,0.274591
4347,242591983,"""\n\nSockpuppetry case\n \nYou have been accus...",0.012607
1370,70880071,Now let's see who's gonna start crying like a ...,0.279953


In [31]:
# Write the submission file: only the id and score columns, without the row index.
df_sub.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)