In [168]:
#!/usr/bin/python3

# Webserver dependencies
from http.server import BaseHTTPRequestHandler, HTTPServer
import argparse
import time
import google.cloud.storage as storage
import google.cloud.pubsub as pubsub

# Sql dependencies
import os
from google.cloud.sql.connector import Connector, IPTypes
import pymysql
import socket, struct
import sqlalchemy

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [169]:
# --- Google Cloud project and Pub/Sub identifiers ---------------------------
PROJECT_ID = "feisty-gasket-398719"
TOPIC_ID = "my-topic"
SUBSCRIPTION_NAME = "my-topic-sub"

# --- Cloud SQL connection settings -------------------------------------------
# Every value below can be overridden by an environment variable of the same
# name, so real credentials do not have to be committed with the notebook.
# The literals are only fallbacks and reproduce the previous hard-coded values.
# For production use prefer a secret store such as Cloud Secret Manager.
INSTANCE_CONNECTION_NAME = os.environ.get(
    "INSTANCE_CONNECTION_NAME",
    "feisty-gasket-398719:us-east1:instance-tigeryi",  # 'project:region:instance'
)
DB_USER = os.environ.get("DB_USER", "root")
DB_PASS = os.environ.get("DB_PASS", "")
DB_NAME = os.environ.get("DB_NAME", "dbhw5")
# Unset (the default) means "use the public IP"; set DB_PRIVATE_IP to
# 1/true/yes (case-insensitive) to connect over the private IP instead.
DB_PRIVATE_IP = os.environ.get("DB_PRIVATE_IP", "").strip().lower() in {"1", "true", "yes"}

# --- Request-handling constants -----------------------------------------------
# Not referenced by the cells visible in this part of the notebook.
BANNED_COUNTRIES = ["North Korea", "Iran", "Cuba", "Myanmar", "Iraq", "Libya", "Sudan", "Zimbabwe", "Syria"]
HTTP_METHODS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']

In [170]:
def connect_with_connector() -> sqlalchemy.engine.base.Engine:
    """
    Build a SQLAlchemy engine for a Cloud SQL MySQL instance.

    Connections are opened through the Cloud SQL Python Connector using the
    "pymysql" driver. The instance name, user, password, database name and
    the private/public IP choice are read from the module-level constants
    INSTANCE_CONNECTION_NAME, DB_USER, DB_PASS, DB_NAME and DB_PRIVATE_IP
    when this function is called.

    Returns:
        The engine produced by ``sqlalchemy.create_engine``; every new
        connection it needs is obtained from the connector.
    """
    # NOTE: the credentials come from constants defined in this notebook.
    # A dedicated secret store such as Cloud Secret Manager
    # (https://cloud.google.com/secret-manager) is a safer place for them.
    target_instance = INSTANCE_CONNECTION_NAME  # e.g. 'project:region:instance'
    username = DB_USER
    secret = DB_PASS
    database = DB_NAME

    if DB_PRIVATE_IP:
        network = IPTypes.PRIVATE
    else:
        network = IPTypes.PUBLIC

    cloud_sql = Connector(network)

    def _open_connection() -> pymysql.connections.Connection:
        """Open a single pymysql connection through the connector."""
        return cloud_sql.connect(
            target_instance,
            "pymysql",
            user=username,
            password=secret,
            db=database,
        )

    # The URL names only the dialect/driver; the actual connection is
    # supplied by the `creator` callable.
    return sqlalchemy.create_engine("mysql+pymysql://", creator=_open_connection)

In [183]:
# Create the SQLAlchemy engine once; later cells pass it to pandas as the connection.
pool = connect_with_connector()

In [184]:
# Fetch the numeric "*2" columns together with their text counterparts from table2.
query = (
    "select ip2, country2, gender2, age2, income2, "
    "ip, country, gender, age, income "
    "from table2;"
)


In [185]:
# Run the query through the engine and load the whole result set into a DataFrame.
df = pd.read_sql_query(sql=query, con=pool)

In [186]:
# Preview the first rows. Ending the cell with the bare expression lets Jupyter
# render the DataFrame with its rich (table) repr instead of the line-wrapped
# plain text that print() produces for wide frames.
df.head()

          ip2  country2  gender2  age2  income2              ip      country  \
0  2737397699       160        1     0        6   163.41.95.195      Somalia   
1  2509268020        89        1     0        6  149.144.100.52       Kosovo   
2  1314065103       113        0     1        1    78.83.10.207       Monaco   
3  2436094690         9        0     7        0  145.51.218.226      Austria   
4   294390865       162        1     1        4    17.140.12.81  South Korea   

   gender    age     income  
0  Female   0-16  150k-250k  
1  Female   0-16  150k-250k  
2    Male  17-25    10k-20k  
3    Male    76+      0-10k  
4  Female  17-25   60k-100k  


In [187]:
# Number of rows returned by the query.
df.shape[0]

73083

In [188]:
# Feature matrix: the four numeric "*2" columns (ip, country, gender, age).
X = df.loc[:, ['ip2', 'country2', 'gender2', 'age2']]
X

Unnamed: 0,ip2,country2,gender2,age2
0,2737397699,160,1,0
1,2509268020,89,1,0
2,1314065103,113,0,1
3,2436094690,9,0,7
4,294390865,162,1,1
...,...,...,...,...
73078,698421238,26,1,6
73079,2414169747,2,0,5
73080,2382936819,112,1,3
73081,803091905,25,0,5


In [189]:
# Target vector: the numeric income column (presumably the coded form of `income` - confirm).
y = df.loc[:, 'income2']
y

0        6
1        6
2        1
3        0
4        4
        ..
73078    6
73079    7
73080    7
73081    1
73082    1
Name: income2, Length: 73083, dtype: int64

In [190]:
# Random forest with 200 trees and a fixed seed so repeated fits are reproducible.
# The hyperparameters are passed as one unpacked mapping.
clf = RandomForestClassifier(**{
    "n_estimators": 200,
    "criterion": "gini",
    "random_state": 0,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
})

# max_depth is not set here (a max_depth=100 setting is left disabled).

In [191]:
# Train on the full data set: every row of X / y is used for fitting.
clf.fit(X=X, y=y)

In [192]:
# Mean accuracy on the SAME X / y the model was fitted on, i.e. training
# accuracy. It is an optimistic figure and not an estimate of how the model
# performs on unseen rows.
# NOTE(review): evaluate on a held-out split or with cross-validation instead.
clf.score(X=X, y=y)

0.7094673179809258

In [193]:
# Predicted labels for the rows of X (the same rows the model was fitted on).
y_pred = clf.predict(X=X)
y_pred

array([6, 6, 1, ..., 3, 1, 1])

In [194]:
# Fraction of predictions that equal the true label. Because y_pred was
# produced from X, this is again training accuracy and matches clf.score(X, y).
accuracy_score(y_true=y, y_pred=y_pred)

0.7094673179809258