In [2]:
# Author : Yashesh Savani
# Date Created: 26th July, 2020
# Serverless Project: Machine Learning Analysis 1
# Reference:
# “K-Means Clustering with scikit-learn,” Jonathan Soma makes things. [Online]. 
# Available: http://jonathansoma.com/lede/algorithms-2017/classes/clustering/k-means-clustering-with-scikit-learn/. [Accessed: 26-Jul-2020].

In [108]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import pickle
import bs4
import re
from google.cloud import storage

In [96]:
# Notebook-local run configuration, published as environment variables so the
# next cell can read it back with the same lookups a deployed job would use.
os.environ.update(
    {
        "SRC_BUCKET": "newsreutersfiles",
        "DEST_MODEL_BUCKET": "kmeansmodels",
        "NUM_CLUSTERS": "4",
    }
)

In [97]:
# Read the run configuration back from the process environment.
src_bucket = os.environ.get("SRC_BUCKET")
dest_model_bucket = os.environ.get("DEST_MODEL_BUCKET")
# The cluster count is stored as text in the environment; convert it once here.
num_clusters = int(os.environ.get("NUM_CLUSTERS"))

In [98]:
# Local working file holding one extracted title per line.
TITLE_FILE_NAME = "titles.txt"
# Pickle of the fitted KMeans estimator (written locally, then uploaded).
KMEANS_MODEL_PKL = "kmeans_model.pkl"
# Pickle of the CountVectorizer vocabulary learned from the titles.
FEATURES_PKL = "feature.pkl"

In [99]:
# Extract titles from every file in the source bucket
def extract_titles_from_files():
    """Write the text of every ``<title>`` tag in the source bucket to a local file.

    Every object in the bucket named by ``src_bucket`` is downloaded and parsed
    with BeautifulSoup's ``html.parser``. The text of each ``<title>`` element
    is stripped of bracketed fragments, trimmed, lower-cased and written to
    ``TITLE_FILE_NAME``, one title per line.

    The output file is opened once and truncated, so running this cell again
    rebuilds the file instead of appending a second copy of every title.

    Returns:
        None. The result is the contents of ``TITLE_FILE_NAME``.
    """
    client_storage = storage.Client()

    # Removes fragments that start with "<" and run up to the next "<", ">" or ")".
    bracket_fragment = re.compile(r"<[-.'/()\s\w]*[<>)]")

    with open(TITLE_FILE_NAME, "w", encoding="utf-8", errors="ignore") as f:
        # list_blobs already yields blob objects, so each one is downloaded
        # directly; a second get_blob lookup per file is unnecessary and could
        # return None if the object disappeared in between.
        for file_blob in client_storage.list_blobs(src_bucket):
            file_data = file_blob.download_as_string()
            soup = bs4.BeautifulSoup(file_data, "html.parser")
            for title in soup.find_all("title"):
                title_text = bracket_fragment.sub("", title.text).strip()
                f.write(title_text.lower() + "\n")

In [100]:
# Download the source files from Google Cloud Storage and write their titles to titles.txt.
extract_titles_from_files()

In [119]:
# Build the word-count matrix for the titles and save the learned vocabulary
def create_model(FILE_NAME):
    """Vectorise the titles in ``FILE_NAME`` and persist the fitted vocabulary.

    The file is split on newlines and every resulting entry (empty ones
    included) is treated as one document, so the row count matches what
    ``kmeans_clustering`` gets when it splits the same file.

    Args:
        FILE_NAME: Path of the newline-separated titles file.

    Returns:
        The sparse document-term matrix returned by
        ``CountVectorizer.fit_transform``, one row per entry.

    Side effects:
        Pickles the vectorizer's ``vocabulary_`` mapping to ``FEATURES_PKL``.
    """
    with open(FILE_NAME, encoding="utf-8", errors="ignore") as f:
        file_data = f.read().split("\n")

    title_word_vector = CountVectorizer(stop_words='english')
    word_matrix = title_word_vector.fit_transform(file_data)

    # No dense DataFrame is built here: it was never used, it expands the whole
    # sparse matrix in memory, and it relied on get_feature_names(), which newer
    # scikit-learn releases no longer provide.
    with open(FEATURES_PKL, "wb") as vocab_file:
        pickle.dump(title_word_vector.vocabulary_, vocab_file)
    return word_matrix

In [120]:
# Vectorise the collected training titles; this also writes the vocabulary to feature.pkl.
word_matrix = create_model(TITLE_FILE_NAME)

In [133]:
# Train the model using training titles data
def kmeans_clustering(word_matrix, FILE_NAME, random_state=42):
    """Fit K-Means on the title matrix and publish the per-title cluster labels.

    Args:
        word_matrix: Document-term matrix to cluster. It must have one row per
            entry obtained by splitting ``FILE_NAME`` on newlines, otherwise
            building the title/label table fails.
        FILE_NAME: Path of the newline-separated titles file the matrix was
            built from.
        random_state: Seed forwarded to ``KMeans`` so repeated runs produce the
            same clusters. Pass ``None`` to get the previous unseeded behaviour.

    Returns:
        The fitted ``KMeans`` estimator (``n_clusters`` comes from ``num_clusters``).

    Side effects:
        Writes ``KMEANS_MODEL_PKL`` and ``Title_cluster.csv`` to the working
        directory and uploads the CSV to the bucket named by ``dest_model_bucket``.
    """
    cluster_csv = "Title_cluster.csv"

    with open(FILE_NAME, encoding="utf-8", errors="ignore") as f:
        file_data = f.read().split("\n")

    kmeans_m = KMeans(n_clusters=num_clusters, random_state=random_state)
    kmeans_m.fit(word_matrix)

    # Persist the model before any network call so a failed upload does not
    # throw away the fitted estimator.
    with open(KMEANS_MODEL_PKL, "wb") as model_file:
        pickle.dump(kmeans_m, model_file)

    # One row per title with the cluster it was assigned to.
    df = pd.DataFrame({"title": file_data, "cluster_number": kmeans_m.labels_})
    df.to_csv(cluster_csv, index=False)

    # Upload the title/cluster table to the destination bucket.
    client_storage = storage.Client()
    dest_bucket_object = client_storage.get_bucket(dest_model_bucket)
    dest_bucket_object.blob(cluster_csv).upload_from_filename(cluster_csv)
    return kmeans_m

In [134]:
# Fit K-Means on the title matrix; this also uploads Title_cluster.csv and writes kmeans_model.pkl locally.
kmeans_m = kmeans_clustering(word_matrix,TITLE_FILE_NAME)

In [113]:
# Push the pickled model and vocabulary to the bucket for further use
def upload_models_to_bucket():
    """Upload ``KMEANS_MODEL_PKL`` and ``FEATURES_PKL`` to ``dest_model_bucket``.

    Each local file is read in binary mode and stored under an object name
    equal to its local file name.
    """
    gcs = storage.Client()
    target_bucket = gcs.get_bucket(dest_model_bucket)

    # Model pickle first, then the vocabulary pickle.
    for artefact_name in (KMEANS_MODEL_PKL, FEATURES_PKL):
        remote_blob = target_bucket.blob(artefact_name)
        with open(artefact_name, "rb") as handle:
            remote_blob.upload_from_file(handle)


In [114]:
# Publish kmeans_model.pkl and feature.pkl to the destination bucket.
upload_models_to_bucket()

In [136]:
def test_kmeans(MODEL_PATH, test_sentence, FEATURE_PATH):
    """Print the cluster predicted for one sentence using the saved artefacts.

    Args:
        MODEL_PATH: Path of the pickled, fitted K-Means model.
        test_sentence: Text to classify; it is vectorised as a single document.
        FEATURE_PATH: Path of the pickled vocabulary used to rebuild the
            ``CountVectorizer`` so the columns line up with the trained model.

    Returns:
        None. The predicted label array is printed.
    """
    # NOTE: pickle.load can execute arbitrary code from the file, so only load
    # artefacts produced by this notebook or another trusted source.
    with open(MODEL_PATH, "rb") as model_file:
        kmeans_model = pickle.load(model_file)
    with open(FEATURE_PATH, "rb") as vocab_file:
        vocab = pickle.load(vocab_file)

    # A fixed vocabulary means transform() works without fitting.
    title_word_vector = CountVectorizer(vocabulary=vocab)
    matrix = title_word_vector.transform([test_sentence])

    pred = kmeans_model.predict(matrix)
    print(pred)

In [137]:
# Smoke-test the saved artefacts on one sample title, using the same file-name
# constants the training cells wrote to so the paths cannot drift apart.
test_kmeans(KMEANS_MODEL_PKL, "BAHIA COCOA REVIEW", FEATURES_PKL)

[0]
