# Explore here

It's recommended to use this notebook for exploration purposes.

For example: 

1. You could import the CSV generated by python into your notebook and explore it.
2. You could connect to your database using `pandas.read_sql` from this notebook and explore it.

In [13]:
# %pip install pandas
# %pip install matplotlib
# %pip install sklearn

In [14]:
import pandas as pd
import pickle
import numpy as np
import re
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

**Step 1: Load your dataset and do the necessary transformations on your target variable.**

In [15]:
# df_raw = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv')
# df_raw.to_csv('../data/raw/url_spam.csv')
df_raw = pd.read_csv('../data/raw/url_spam.csv')

In [16]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2999 non-null   int64 
 1   url         2999 non-null   object
 2   is_spam     2999 non-null   bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 49.9+ KB


In [17]:
df_raw['is_spam'].value_counts()

False    2303
True      696
Name: is_spam, dtype: int64

In [18]:
df = df_raw.copy()

In [19]:
df = df.drop_duplicates().reset_index(drop = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2999 non-null   int64 
 1   url         2999 non-null   object
 2   is_spam     2999 non-null   bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 49.9+ KB


In [20]:
# varias funciones

def comas(text):
    """
    Elimina comas del texto
    """
    return re.sub(',', ' ', text)

def espacios(text):
    """
    Elimina enters dobles por un solo enter
    """
    return re.sub(r'(\n{2,})','\n', text)

def minuscula(text):
    """
    Cambia mayusculas a minusculas
    """
    return text.lower()

def numeros(text):
    """
    Sustituye los numeros
    """
    return re.sub('([\d]+)', ' ', text)

def caracteres_no_alfanumericos(text):
    """
    Sustituye caracteres raros, no digitos y letras
    Ej. hola 'pepito' como le va? -> hola pepito como le va
    """
    return re.sub("(\\W)+"," ",text)

def comillas(text):
    """
    Sustituye comillas por un espacio
    Ej. hola 'pepito' como le va? -> hola pepito como le va?
    """
    return re.sub("'"," ", text)

def palabras_repetidas(text):
    """
    Sustituye palabras repetidas

    Ej. hola hola, como les va? a a ustedes -> hola, como les va? a ustedes
    """
    return re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

def esp_multiple(text):
    """
    Sustituye los espacios dobles entre palabras
    """
    return re.sub(' +', ' ',text)


#como utilizo estas funciones, ejemplo:
#df['texto_limpio'] = df['texto'].apply(espacios).apply(comas).apply(url).apply(minuscula).apply(esp_multiple).apply(comillas)
#df['texto_limpio'].values[:]


In [21]:
# funcón para eliminar https
def url(text):
    return re.sub(r'(https://www|https://)', '', text)

# se elimina https
df['url_limpia'] = df['url'].apply(url).apply(caracteres_no_alfanumericos).apply(esp_multiple)

#chequeo que lo elimino en column url_limpia
df.head()

Unnamed: 0.1,Unnamed: 0,url,is_spam,url_limpia
0,0,https://briefingday.us8.list-manage.com/unsubs...,True,briefingday us8 list manage com unsubscribe
1,1,https://www.hvper.com/,True,hvper com
2,2,https://briefingday.com/m/v4n3i4f3,True,briefingday com m v4n3i4f3
3,3,https://briefingday.com/n/20200618/m#commentform,False,briefingday com n 20200618 m commentform
4,4,https://briefingday.com/fan,True,briefingday com fan


In [22]:
#codifico la columna is_spam
df['is_spam'] = df['is_spam'].apply(lambda x: 1 if x == True else 0)

In [23]:
#chequeo que se codifique is_spam
df.head()

Unnamed: 0.1,Unnamed: 0,url,is_spam,url_limpia
0,0,https://briefingday.us8.list-manage.com/unsubs...,1,briefingday us8 list manage com unsubscribe
1,1,https://www.hvper.com/,1,hvper com
2,2,https://briefingday.com/m/v4n3i4f3,1,briefingday com m v4n3i4f3
3,3,https://briefingday.com/n/20200618/m#commentform,0,briefingday com n 20200618 m commentform
4,4,https://briefingday.com/fan,1,briefingday com fan


**Step2: Use NLP techniques to preprocess the data.** 

In [24]:
vec = CountVectorizer().fit_transform(df['url_limpia'])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(vec, df['is_spam'], stratify = df['is_spam'], random_state = 2207)

**Step3: Use Support Vector machine to build a url spam classifier.**

In [26]:
classifier = SVC(C = 1.0, kernel = 'linear', gamma = 'auto')

In [27]:
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97       576
           1       0.86      0.93      0.89       174

    accuracy                           0.95       750
   macro avg       0.92      0.94      0.93       750
weighted avg       0.95      0.95      0.95       750



In [28]:
# optimizo hiperparámetros
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVC(random_state=1234),param_grid,verbose=2)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.3s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.1s
[CV] END .....................C=0.1, gamma=1, k

In [29]:
grid.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

In [30]:
grid.best_estimator_

In [31]:
best_model = grid.best_estimator_

In [32]:
predictions = grid.best_estimator_.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       576
           1       0.95      0.92      0.93       174

    accuracy                           0.97       750
   macro avg       0.96      0.95      0.96       750
weighted avg       0.97      0.97      0.97       750



In [33]:
pickle.dump(best_model, open('../models/best_model.pickle', 'wb'))