In [None]:


############################################################
########################################### Team Challenge
############################################################
# 
## Work in Project Groups
# 
# - tokenize the dataset hosted on BigQuery at this
# URL: https://console.cloud.google.com/bigquery?project=questrom&d=SMSspam&

## review the slides at the end of this module
## predict spam
## objective = you are scored on accuracy
## only input is text, but you can derive features
## time is limited -- how do you make the most of it (and of the model)?
## HINTS:
##        start small, simple models
##        iterate and see how you do against the leaderboard
##        code above helps you with the core mechanics


In [None]:
# installs/updates
! pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.2 MB)
[K     |████████████████████████████████| 23.2 MB 1.8 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-1.0.1 threadpoolctl-3.0.0


In [None]:
# imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# text
from sklearn.feature_extraction.text import CountVectorizer

# compression
from sklearn.decomposition import PCA




In [None]:
# get the datasets -- one DataFrame per BigQuery table, all billed to the "questrom" project
ds_train = pd.read_gbq(
    "SELECT * FROM `questrom.SMSspam.train`",
    project_id="questrom",
)
ds_test = pd.read_gbq(
    "SELECT * FROM `questrom.SMSspam.test`",
    project_id="questrom",
)
ds_sample = pd.read_gbq(
    "SELECT * FROM `questrom.SMSspam.sample-submission`",
    project_id="questrom",
)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=W1L6ya7f2RUuWwwNizJyGpLWdBFS6f&prompt=consent&access_type=offline
Enter the authorization code: 4/1AX4XfWhwHDgC4M4LVByuSXSFJO2-7-e5yggCsOvU9_iC6PSbi6XZswBpcuI


In [None]:
# the sample submission shows the layout (id, label) that a submission file must follow
ds_sample.sample(n=3)

Unnamed: 0,id,label
136,1314,ham
415,4132,ham
204,2107,ham


In [None]:
# example upload: dump the sample submission as-is, leaving the DataFrame index out of the file
ds_sample.to_csv(path_or_buf="example.csv", index=False)

In [None]:
# peek at three random training rows (message, label, id)
ds_train.sample(n=3)

Unnamed: 0,message,label,id
1486,"Under the sea, there lays a rock. In the rock,...",ham,1947
3638,Congratulations U can claim 2 VIP row A Ticket...,spam,2928
1964,You only hate me. You can call any but you did...,ham,2561


In [None]:
# vectorize the data -- vocabulary capped at 500 terms, learned from the training messages only
cv = CountVectorizer(max_features=500).fit(ds_train["message"])
cv

CountVectorizer(max_features=500)

In [None]:
# reduce: project the 500-column document-term matrix onto 50 principal components
dtm = cv.transform(ds_train.message).toarray()  # densify: cv.transform returns a sparse matrix
# Seed PCA. With a matrix this large and only 50 components, svd_solver="auto"
# picks the randomized solver, so without random_state the components -- and every
# model fit on them -- change between runs. 820 matches the seed used for the tree.
pca = PCA(n_components=50, random_state=820)
pcs = pca.fit_transform(dtm)

In [None]:
# share of the total variance kept by the retained components
retained_variance = pca.explained_variance_ratio_.sum()
retained_variance

0.5425849044964769

In [None]:
# fit a shallow, seeded decision tree on the principal components of the training data
tree = DecisionTreeClassifier(
    random_state=820,
    max_depth=5,
    min_samples_leaf=15,
    min_samples_split=30,
).fit(pcs, ds_train["label"])
tree

DecisionTreeClassifier(max_depth=5, min_samples_leaf=15, min_samples_split=30,
                       random_state=820)

In [None]:
# accuracy on the TRAIN set -- the same rows the tree was fit on, so read it as an optimistic number
train_accuracy = tree.score(pcs, ds_train["label"])
train_accuracy

0.9546284224250325

In [None]:
# apply the model to the test set, reusing the vectorizer and PCA that were fitted on train
test_dtm = cv.transform(ds_test["message"]).toarray()
test_vs = pca.transform(test_dtm)
test_preds = tree.predict(test_vs)
test_preds[:5]

array(['ham', 'spam', 'ham', 'ham', 'ham'], dtype=object)

In [None]:
# build out a dataset for the submission: store the predictions next to each test message
ds_test["label"] = test_preds
ds_test.sample(n=3)

Unnamed: 0,message,id,label
147,Just seeing your missed call my dear brother. ...,1430,spam
131,If u laugh really loud.. If u talk spontaneous...,1246,ham
66,Ok i msg u b4 i leave my house.,599,ham


In [None]:
# write out the submission file: only the id and the predicted label, without the index
submission = ds_test[["id", "label"]]
submission.to_csv("myteam-submission.csv", index=False)

In [None]:
! head myteam-submission.csv

id,label
4,ham
5,spam
11,ham
19,ham
21,ham
52,ham
59,ham
70,spam
76,ham


# Let's serve this model!

In [None]:

# save the models to disk: the three fitted objects needed to score a new message
# (vectorizer -> PCA -> tree), one joblib file each
import joblib

joblib.dump(value=pca, filename="pca.joblib")
joblib.dump(value=cv, filename="cv.joblib")
joblib.dump(value=tree, filename="tree.joblib")

['tree.joblib']

In [None]:
# predict spam
# confirm: push one known-spam message through the fitted cv -> pca -> tree chain
S = "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
# Dedicated names: reusing `dtm` / `pcs` here would overwrite the training matrices
# built earlier, and re-running the fit or score cells afterwards would then fail.
spam_dtm = cv.transform([S]).toarray()
spam_pcs = pca.transform(spam_dtm)
pred = tree.predict(spam_pcs)
pred  # display the result so the prediction can actually be confirmed

In [None]:
# predict ham: push one ordinary message through the fitted cv -> pca -> tree chain
H = "Lol your always so convincing."
# Dedicated names: reusing `dtm` / `pcs` here would overwrite the training matrices
# built earlier, and re-running the fit or score cells afterwards would then fail.
ham_dtm = cv.transform([H]).toarray()
ham_pcs = pca.transform(ham_dtm)
pred = tree.predict(ham_pcs)
pred  # display the result so the prediction can actually be checked

In [None]:
# download the streamlit file
! wget https://raw.githubusercontent.com/Btibert3/BA820-Fall-2021/main/apps/streamlit-example/app.py

--2021-11-12 19:18:27--  https://raw.githubusercontent.com/Btibert3/BA820-Fall-2021/main/apps/streamlit-example/app.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1612 (1.6K) [text/plain]
Saving to: ‘app.py’


2021-11-12 19:18:28 (24.0 MB/s) - ‘app.py’ saved [1612/1612]



In [None]:

# 

# Streamlit App

In [None]:
# installs and restart runtime
! pip install streamlit
! pip install pyngrok

In [None]:
# create the ngrok session: open a public tunnel that forwards to local port 8501
from pyngrok import ngrok

tunnel = ngrok.connect(8501)
tunnel




<NgrokTunnel: "http://3016-35-231-55-146.ngrok.io" -> "http://localhost:8501">

In [None]:
# run the app
! streamlit run app.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.2:8501[0m
[34m  External URL: [0m[1mhttp://35.231.55.146:8501[0m
[0m
[34m  Stopping...[0m
