https://www.kaggle.com/yufengg/automl-getting-started-notebook

In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
from datetime import datetime

from sklearn.model_selection import train_test_split

from google.cloud import storage
from google.cloud import automl_v1beta1 as automl

# workaround to fix gapic_v1 error
from google.api_core.gapic_v1.client_info import ClientInfo

from automlwrapper import AutoMLWrapper


In [3]:
from google.oauth2 import service_account

credentials = service_account.Credentials.from_service_account_file("<PATH TO YOUR CREDENTIALS FILE>")

In [9]:
# Set your own values for these. bucket_name should be the project_id + '-lcm'.
PROJECT_ID = '<PROJECT ID>'
bucket_name = '<BUCKET NAME>'

region = 'us-central1' # Region must be us-central1
dataset_display_name = 'kaggle_tweets'
model_display_name = 'kaggle_starter_model1'

storage_client = storage.Client(project=PROJECT_ID, credentials=credentials)

# adding ClientInfo here to get the gapic_v1 call in place
client = automl.AutoMlClient(client_info=ClientInfo(), credentials=credentials)

print(f'Starting AutoML notebook at {datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d, %H:%M:%S UTC")}')


Starting AutoML notebook at 2020-02-27, 06:56:32 UTC


In [6]:
nlp_train_df = pd.read_csv('data/train.csv')
nlp_test_df = pd.read_csv('data/test.csv')
def callback(operation_future):
    result = operation_future.result()

In [7]:
nlp_train_df.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [8]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket. https://cloud.google.com/storage/docs/ """
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print('File {} uploaded to {}'.format(
        source_file_name,
        'gs://' + bucket_name + '/' + destination_blob_name))
    
def download_to_kaggle(bucket_name,destination_directory,file_name,prefix=None):
    """Takes the data from your GCS Bucket and puts it into the working directory of your Kaggle notebook"""
    os.makedirs(destination_directory, exist_ok = True)
    full_file_path = os.path.join(destination_directory, file_name)
    blobs = storage_client.list_blobs(bucket_name,prefix=prefix)
    for blob in blobs:
        blob.download_to_filename(full_file_path)

In [10]:
bucket = storage.Bucket(storage_client, name=bucket_name)
if not bucket.exists():
    bucket.create(location=region)

In [11]:
# Select the text body and the target value, for sending to AutoML NL
nlp_train_df[['text','target']].to_csv('gcs/train.csv', index=False, header=False) 

In [12]:
training_gcs_path = 'uploads/kaggle_getstarted/full_train.csv'
upload_blob(bucket_name, 'gcs/train.csv', training_gcs_path)

File gcs/train.csv uploaded to gs://nlp-getting-started-269502-lcm/uploads/kaggle_getstarted/full_train.csv


In [13]:
amw = AutoMLWrapper(client=client, 
                    project_id=PROJECT_ID, 
                    bucket_name=bucket_name, 
                    region='us-central1', 
                    dataset_display_name=dataset_display_name, 
                    model_display_name=model_display_name)


In [14]:
print(f'Getting dataset ready at {datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d, %H:%M:%S UTC")}')
if not amw.get_dataset_by_display_name(dataset_display_name):
    print('dataset not found')
    amw.create_dataset()
    amw.import_gcs_data(training_gcs_path)

amw.dataset
print(f'Dataset ready at {datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d, %H:%M:%S UTC")}')



Getting dataset ready at 2020-02-27, 07:00:29 UTC
searching for dataset named: kaggle_tweets
no matching datasets found
dataset not found
making new dataset, display name: kaggle_tweets
name: "projects/895214144267/locations/us-central1/datasets/TCN5514382865808228352"
display_name: "kaggle_tweets"
create_time {
  seconds: 1582776032
  nanos: 185165000
}
etag: "AB3BwFr0KXS0DdJFg6UYhQuOlU3_GG-J3MOSe-LugRLOZyicoQQejzXqTC4GWy6WGvw="
text_classification_dataset_metadata {
  classification_type: MULTICLASS
}

importing csv data. This may take a moment
<google.api_core.operation.Operation object at 0x7f1af97db750>

Dataset ready at 2020-02-27, 07:29:51 UTC


In [None]:
print(f'Getting model trained at {datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d, %H:%M:%S UTC")}')

if not amw.get_model_by_display_name():
    print(f'Training model at {datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d, %H:%M:%S UTC")}')
    amw.train_model()

print(f'Model trained. Ensuring model is deployed at {datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d, %H:%M:%S UTC")}')
amw.deploy_model()
amw.model
print(f'Model trained and deployed at {datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d, %H:%M:%S UTC")}')

Getting model trained at 2020-02-27, 07:29:51 UTC
searching for model named: kaggle_starter_model1
no matching models found
Training model at 2020-02-27, 07:29:52 UTC
making new model with dataset TCN5514382865808228352, named kaggle_starter_model1
creating and training model
<google.api_core.operation.Operation object at 0x7f1afc070690>


In [16]:
nlp_test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [17]:
amw.model_full_path

'projects/895214144267/locations/us-central1/models/TCN1916752232422834176'

In [19]:
print(f'Begin getting predictions at {datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d, %H:%M:%S UTC")}')

# Create client for prediction service.
prediction_client = automl.PredictionServiceClient(credentials=credentials)
amw.set_prediction_client(prediction_client)

predictions_df = amw.get_predictions(nlp_test_df, 
                                     input_col_name='text', 
#                                      ground_truth_col_name='target', # we don't have ground truth in our test set
                                     limit=None, 
                                     threshold=0.5,
                                     verbose=False)

print(f'Finished getting predictions at {datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d, %H:%M:%S UTC")}')

Begin getting predictions at 2020-02-27, 12:06:34 UTC
Finished getting predictions at 2020-02-27, 12:40:06 UTC


In [20]:
predictions_df.head()

Unnamed: 0,score,class,text
0,0.588381,1,Just happened a terrible car crash
1,0.602664,1,"Heard about #earthquake is different cities, s..."
2,0.820415,1,"there is a forest fire at spot pond, geese are..."
3,0.539053,1,Apocalypse lighting. #Spokane #wildfires
4,0.906334,1,Typhoon Soudelor kills 28 in China and Taiwan


In [21]:
submission_df = pd.concat([nlp_test_df['id'], predictions_df['class']], axis=1)
submission_df.head()

Unnamed: 0,id,class
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [22]:
submission_df = submission_df.rename(columns={'class':'target'})
submission_df.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [23]:
submission_df.to_csv("submissions/03_google_automl.csv", index=False, header=True)
