In [1]:
import pandas as pd
import numpy as np
#import mysql.connector
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
#python -m nltk.downloader
from google.cloud import bigquery



In [2]:
client = bigquery.Client()

In [3]:
# sql = """
# SELECT *  FROM `mimetic-coral-355913.dwh.comment`
# """
# df = client.query(sql).to_dataframe()
# df.head()

In [4]:
df = pd.read_csv("gs://the-clean-project/train.csv")
df.head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [5]:
df['y'] = (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) > 0 ).astype(int)
df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df.sample(5)
#juste ça à sauvergarder et mettre dans une table

Unnamed: 0,text,y
32044,u cant block me i no where u live.if u block m...,1
48329,Fine. I will wait and watch for a while. You c...,0
34466,"Look, I've returned every serve you've sent ov...",0
126048,"""citation needed ==\n\n""""To this day the Sierr...",0
44350,"""\n\nJim:\n\nI posted a new subject but don't ...",0


In [6]:
df['y'].value_counts(normalize=True)
min_len = (df['y'] == 1).sum()
df_y0_undersample = df[df['y'] == 0].sample(n=min_len, random_state=201)
df = pd.concat([df[df['y'] == 1], df_y0_undersample])
df['y'].value_counts()

from nltk.stem import SnowballStemmer, WordNetLemmatizer
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
all_stopwords = stopwords.words('english')

def clean(comment):
    comment = re.sub('[^a-zA-Z]', ' ', comment)
    comment = comment.lower()
    comment = comment.split()
    comment = [stemmer.stem(word) for word in comment if not word in set(all_stopwords)]
    comment = [lemmatizer.lemmatize(word) for word in comment]
    comment = ' '.join(comment)
    return comment

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
df['text'].iloc[0]

'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'

In [8]:
clean(df['text'].iloc[0])

'cocksuck piss around work'

In [9]:
df['text'] = df['text'].apply(clean)

## TF-IDF

In [10]:
vec = TfidfVectorizer()

In [11]:
X = vec.fit_transform(df['text'])
X

<32450x45777 sparse matrix of type '<class 'numpy.float64'>'
	with 728904 stored elements in Compressed Sparse Row format>

## Fit Naive Bayes

In [12]:
model = MultinomialNB()
model.fit(X, df['y'])

MultinomialNB()

## Validate

In [13]:
# Le fichier validation est chargé dans le bucket dans le même dossier que train.csv !   ;-)
df_val = pd.read_csv("gs://the-clean-project/validation_data.csv")

X_less_toxic = vec.transform(df_val['less_toxic'].apply(clean))
X_more_toxic = vec.transform(df_val['more_toxic'].apply(clean))

p1 = model.predict_proba(X_less_toxic)
p2 = model.predict_proba(X_more_toxic)

# Validation Accuracy
(p1[:, 1] < p2[:, 1]).mean()

0.6675634382888269

## Application on our comments

### Importer les colonnes qui nous intéressent, à savoir ici : id_comment, content

In [14]:
sql_jig = """
SELECT id_comment, content  FROM `mimetic-coral-355913.dwh.comment`
"""
df_jig = client.query(sql_jig).to_dataframe()
df_jig.head()

Unnamed: 0,id_comment,content
0,3d9g31,The moment you realise Pluto (the dog) is on P...
1,5jkp89,Pluto Weather Forecast (x-post r/space)
2,5pak7w,"Congratulations, /r/pluto! You are Tiny Subred..."
3,65b14h,Google loves Pluto! &lt;3
4,7oj41n,"Congratulations, /r/Pluto! You are Subreddit o..."


### Faut sûrement nettoyer les contents avant d'appliquer le modèle ? (fonction clean ci-dessus)

In [15]:
X_test = vec.transform(df_jig['content'])

### Appliquer le modèle sur nos données

In [16]:
p3 = model.predict_proba(X_test)

In [17]:
df_jig['score_jigsaw'] = p3[:,1]
del df_jig['content']
df_jig
# len(df_jig)

Unnamed: 0,id_comment,score_jigsaw
0,3d9g31,0.494898
1,5jkp89,0.252511
2,5pak7w,0.549515
3,65b14h,0.521643
4,7oj41n,0.549515
...,...,...
222024,x9d6y1,0.500000
222025,x9dfxr,0.500000
222026,x9f718,0.500000
222027,x9hxng,0.500000


In [None]:
sql_j = """
SELECT id_comment, type_content  FROM `mimetic-coral-355913.dwh.comment`
"""
df_j = client.query(sql_j).to_dataframe()
df_j.head()

In [32]:
len(df_j)

222029

In [33]:
df_jig=df_jig.merge(df_j, left_on='id_comment', right_on='id_comment')
df_jig


Unnamed: 0,id_comment,score_jigsaw,type_content_x,id_post,id_author,author,content,type_content_y,ups,downs,score,created_utc,extraction_utc,score_jig
0,3d9g31,0.494898,Title,3d9g31,t2_cuesk,AlanZucconi,The moment you realise Pluto (the dog) is on P...,Title,45,0,45,2015-07-14 15:19:36,2022-09-09 13:45:07,0.500000
1,3d9g31,0.494898,Title,3d9g31,t2_cuesk,AlanZucconi,,Post,45,0,45,2015-07-14 15:19:36,2022-09-09 13:45:07,0.500000
2,3d9g31,0.494898,Post,3d9g31,t2_cuesk,AlanZucconi,The moment you realise Pluto (the dog) is on P...,Title,45,0,45,2015-07-14 15:19:36,2022-09-09 13:45:07,0.500000
3,3d9g31,0.494898,Post,3d9g31,t2_cuesk,AlanZucconi,,Post,45,0,45,2015-07-14 15:19:36,2022-09-09 13:45:07,0.500000
4,3d9g31,0.500000,Title,3d9g31,t2_cuesk,AlanZucconi,The moment you realise Pluto (the dog) is on P...,Title,45,0,45,2015-07-14 15:19:36,2022-09-09 13:45:07,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288594,inqgnh8,0.259968,Comment,x9xtpc,t2_4csrvec4,Monster_Jerms,I mean- it’ll probably end up being the defaul...,Comment,-1,0,-1,2022-09-09 15:04:55,2022-09-09 17:53:36,0.259968
288595,inr037e,0.668623,Comment,x9y44i,t2_8bsya,GERBILSAURUSREX,Yeah I hope there are achievement endings that...,Comment,-1,0,-1,2022-09-09 17:13:29,2022-09-09 17:53:36,0.668623
288596,inquk27,0.426407,Comment,x9zu8s,t2_bdq6a,LockedDoor_,Shocking they could do it with near triple the...,Comment,-1,0,-1,2022-09-09 16:37:02,2022-09-09 18:23:30,0.426407
288597,inr9hw6,0.668213,Comment,xa2aa6,t2_q3l5f2gv,Loumind555,I donot know but here a upvote,Comment,-1,0,-1,2022-09-09 18:15:32,2022-09-09 19:31:35,0.668213


## Tâche finale : enregistrer les scores ainsi calculés dans le DWH

In [20]:
# dataset_id = 'dwh'
# # For this sample, the table must already exist and have a defined schema
# table_id ="score_jig"
# table_ref = client.dataset(dataset_id).table(table_id)
# table = client.get_table(table_ref)
# client.insert_rows(table, df_jig.values.tolist())


In [21]:
#from google.cloud import bigquery

#client = bigquery.Client()

#dml_statement = ("UPDATE `dwh.test` SET colonne2 = 'A' WHERE colonne1 = 2;")
#print(dml_statement)
#query_job = client.query(dml_statement)  # API request
#query_job.result()  # Waits for statement to finish

In [22]:
#pip install pandas-gbq

In [23]:
import pandas_gbq

In [24]:
pandas_gbq.to_gbq(df_jig, "dwh.score_jig", project_id="mimetic-coral-355913", if_exists="replace")

In [25]:
# def bq_load(key, value):
  
#   project_name = 'YOUR PROJECT NAME'
#   dataset_name = 'YOUR DATASET NAME'
#   table_name = key
  
#   value.to_gbq(destination_table='{}.{}'.format(dataset_name, table_name), project_id=project_name, if_exists='replace')
# bq_load("mimetic-coral-355913.dwh.score_jig",df_jig)

In [26]:
# from google.cloud import bigqueryclient = bigquery.Client()

# assert table.encryption_configuration.kms_key_name == original_kms_key_name

# # Set a new encryption key to use for the destination.
# # TODO: Replace this key with a key you have created in KMS.
# updated_kms_key_name = (
#     ""
# )
# table.encryption_configuration = bigquery.EncryptionConfiguration(
#     kms_key_name=updated_kms_key_name
# )

# table = client.update_table(table, ["encryption_configuration"])  # API request

# assert table.encryption_configuration.kms_key_name == updated_kms_key_name
# assert original_kms_key_name != updated_kms_key_name

In [27]:
# sql = """
# SELECT *  FROM `mimetic-coral-355913.dwh.comment`
# """
# df15 = client.query(sql).to_dataframe()
# df15.head()