# Kaggle Comp Jigsaw Rules Classification

### Dependencies

In [25]:
import os
import re
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

### Loading the Data

In [2]:
train_df = pd.read_csv(os.path.join("Data/", 'train.csv'))
train_df.set_index('row_id', inplace=True)
train_df.head()

Unnamed: 0_level_0,body,rule,subreddit,positive_example_1,positive_example_2,negative_example_1,negative_example_2,rule_violation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Banks don't want you to know this! Click here ...,"No Advertising: Spam, referral links, unsolici...",Futurology,If you could tell your younger self something ...,hunt for lady for jack off in neighbourhood ht...,Watch Golden Globe Awards 2017 Live Online in ...,"DOUBLE CEE x BANDS EPPS - ""BIRDS""\n\nDOWNLOAD/...",0
1,SD Stream [ ENG Link 1] (http://www.sportsstre...,"No Advertising: Spam, referral links, unsolici...",soccerstreams,[I wanna kiss you all over! Stunning!](http://...,LOLGA.COM is One of the First Professional Onl...,#Rapper \n🚨Straight Outta Cross Keys SC 🚨YouTu...,[15 Amazing Hidden Features Of Google Search Y...,0
2,Lol. Try appealing the ban and say you won't d...,No legal advice: Do not offer or request legal...,pcmasterrace,Don't break up with him or call the cops. If ...,It'll be dismissed: https://en.wikipedia.org/w...,Where is there a site that still works where y...,Because this statement of his is true. It isn'...,1
3,she will come your home open her legs with an...,"No Advertising: Spam, referral links, unsolici...",sex,Selling Tyrande codes for 3€ to paypal. PM. \n...,tight pussy watch for your cock get her at thi...,NSFW(obviously) http://spankbang.com/iy3u/vide...,Good News ::Download WhatsApp 2.16.230 APK for...,1
4,code free tyrande --->>> [Imgur](http://i.imgu...,"No Advertising: Spam, referral links, unsolici...",hearthstone,wow!! amazing reminds me of the old days.Well...,seek for lady for sex in around http://p77.pl/...,must be watch movie https://sites.google.com/s...,We're streaming Pokemon Veitnamese Crystal RIG...,1


In [3]:
train_df.isnull().sum()

body                  0
rule                  0
subreddit             0
positive_example_1    0
positive_example_2    0
negative_example_1    0
negative_example_2    0
rule_violation        0
dtype: int64

In [4]:
train_df["body"] = train_df["body"].map(lambda x: re.sub(r"http[s]?://\S+", "[LINK]", x))
train_df["positive_example_1"] = train_df["positive_example_1"].map(lambda x: re.sub(r"http[s]?://\S+", "[LINK]", x))
train_df["positive_example_2"] = train_df["positive_example_2"].map(lambda x: re.sub(r"http[s]?://\S+", "[LINK]", x))
train_df["negative_example_1"] = train_df["negative_example_1"].map(lambda x: re.sub(r"http[s]?://\S+", "[LINK]", x))
train_df["negative_example_2"] = train_df["negative_example_2"].map(lambda x: re.sub(r"http[s]?://\S+", "[LINK]", x))

train_df["body"] = train_df["body"].map(lambda x: re.sub(r"[^\s\w#\]\[]", " ", x))
train_df["positive_example_1"] = train_df["positive_example_1"].map(lambda x: re.sub(r"[^\s\w#\]\[]", " ", x))
train_df["positive_example_2"] = train_df["positive_example_2"].map(lambda x: re.sub(r"[^\s\w#\]\[]", " ", x))
train_df["negative_example_1"] = train_df["negative_example_1"].map(lambda x: re.sub(r"[^\s\w#\]\[]", " ", x))
train_df["negative_example_2"] = train_df["negative_example_2"].map(lambda x: re.sub(r"[^\s\w#\]\[]", " ", x))

train_df["body"] = train_df["body"].map(lambda x: re.sub(r"\s\s+", " ", x))
train_df["positive_example_1"] = train_df["positive_example_1"].map(lambda x: re.sub(r"\s\s+", " ", x))
train_df["positive_example_2"] = train_df["positive_example_2"].map(lambda x: re.sub(r"\s\s+", " ", x))
train_df["negative_example_1"] = train_df["negative_example_1"].map(lambda x: re.sub(r"\s\s+", " ", x))
train_df["negative_example_2"] = train_df["negative_example_2"].map(lambda x: re.sub(r"\s\s+", " ", x))

train_df.head()


Unnamed: 0_level_0,body,rule,subreddit,positive_example_1,positive_example_2,negative_example_1,negative_example_2,rule_violation
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Banks don t want you to know this Click here t...,"No Advertising: Spam, referral links, unsolici...",Futurology,If you could tell your younger self something ...,hunt for lady for jack off in neighbourhood [L...,Watch Golden Globe Awards 2017 Live Online in ...,DOUBLE CEE x BANDS EPPS BIRDS DOWNLOAD STREAM ...,0
1,SD Stream [ ENG Link 1] [LINK],"No Advertising: Spam, referral links, unsolici...",soccerstreams,[I wanna kiss you all over Stunning ] [LINK],LOLGA COM is One of the First Professional Onl...,#Rapper Straight Outta Cross Keys SC YouTube S...,[15 Amazing Hidden Features Of Google Search Y...,0
2,Lol Try appealing the ban and say you won t do...,No legal advice: Do not offer or request legal...,pcmasterrace,Don t break up with him or call the cops If yo...,It ll be dismissed [LINK] The first amendment ...,Where is there a site that still works where y...,Because this statement of his is true It isn t...,1
3,she will come your home open her legs with and...,"No Advertising: Spam, referral links, unsolici...",sex,Selling Tyrande codes for 3 to paypal PM [LINK],tight pussy watch for your cock get her at thi...,NSFW obviously [LINK],Good News Download WhatsApp 2 16 230 APK for A...,1
4,code free tyrande [Imgur] [LINK] for you and y...,"No Advertising: Spam, referral links, unsolici...",hearthstone,wow amazing reminds me of the old days Well D...,seek for lady for sex in around [LINK],must be watch movie [LINK],We re streaming Pokemon Veitnamese Crystal RIG...,1


In [12]:
train_df[["body", "rule"]]

Unnamed: 0_level_0,body,rule
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Banks don t want you to know this Click here t...,"No Advertising: Spam, referral links, unsolici..."
1,SD Stream [ ENG Link 1] [LINK],"No Advertising: Spam, referral links, unsolici..."
2,Lol Try appealing the ban and say you won t do...,No legal advice: Do not offer or request legal...
3,she will come your home open her legs with and...,"No Advertising: Spam, referral links, unsolici..."
4,code free tyrande [Imgur] [LINK] for you and y...,"No Advertising: Spam, referral links, unsolici..."
...,...,...
2024,Please edit your post so it is readable These ...,No legal advice: Do not offer or request legal...
2025,Yes and in a right to work state they can even...,No legal advice: Do not offer or request legal...
2026,HD Streams ENG HD [ Watch here PC Mobile ] [L...,"No Advertising: Spam, referral links, unsolici..."
2027,No Not when doing so obviously presents a safe...,No legal advice: Do not offer or request legal...


In [15]:
X = train_df[["body", "rule","subreddit","positive_example_1","positive_example_2","negative_example_1","negative_example_2"]].astype(str).agg(' '.join, axis=1)
y = train_df['rule_violation']

model = Pipeline([
    ('cv', CountVectorizer()),
    ('clf', LogisticRegression())
])

model.fit(X, y)
model


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
joblib.dump(model,os.path.join("TrainedModels","cvlr-o1.bin"))

['TrainedModels/cvlr-o1.bin']