# Environment

## Libraries

In [135]:
import os, subprocess, requests, getpass, urllib.parse, sys, pathlib, string, spacy, bs4, \
numpy as np, seaborn as sns, pandas as pd, matplotlib.pyplot as plt, google.cloud.bigquery as bigquery, \
tensorflow as rf, bert as bc

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV

## Specifying the Working Directory

In [2]:
# Walk upwards from the current directory until a directory containing an entry
# named 'notebook' is found, giving up after a bounded number of steps.
workingdir = os.getcwd()
d = os.listdir(workingdir)
n = 0
while 'notebook' not in d:
    # Move one level up and look at that directory's entries instead.
    workingdir = str(pathlib.Path(workingdir).parent)
    d = os.listdir(workingdir)
    n += 1
    if n > 5:
        # Stop climbing; whatever directory was reached is used below.
        break

# Make the chosen directory importable and the base for relative paths.
sys.path.insert(0, workingdir)
os.chdir(workingdir)

## Dealing with Warnings

In [3]:
# Turn off pandas' chained-assignment check for the whole session
# (None disables both the warning and the error mode).
pd.set_option('mode.chained_assignment', None)

## Function Definitions

In [4]:
def decode_html(input_str: str, body: str = '') -> str:
    """Extract plain text from an HTML string.

    Args:
        input_str: HTML markup to parse with the 'html.parser' backend.
        body: Optional tag name. When empty (the default) the text of the
            whole document is returned; otherwise only the text of the
            elements matching this tag name is returned.

    Returns:
        The document text, or the texts of all matching elements joined by
        single spaces (an empty string when nothing matches).
    """
    soup = bs4.BeautifulSoup(input_str, 'html.parser')

    if len(body) == 0:
        return soup.text

    # find_all returns element objects, not strings, so str.join cannot be
    # applied to them directly; join the text of each element instead.
    return ' '.join(element.text for element in soup.find_all(body))

In [5]:
def nlp(input_list: list, spacy_obj) -> list:
    """Turn the input into a list of lower-cased lemmas.

    Args:
        input_list: Value handed unchanged to `spacy_obj`.
        spacy_obj: Callable whose result iterates over tokens exposing
            `lemma_` and `is_stop`.

    Returns:
        Lower-cased lemmas of all tokens that are not flagged as stop words
        and whose lemma is not a single punctuation character, a single
        digit, or '-pron-'.
    """
    excluded = [*string.punctuation, *string.digits, '-pron-']

    lemmas = []
    for token in spacy_obj(input_list):
        if token.is_stop:
            continue
        lemma = token.lemma_.lower()
        if lemma in excluded:
            continue
        lemmas.append(lemma)
    return lemmas

In [6]:
def tokenizer(input_list: list) -> list:
    """Tokenize the input into lower-cased, filtered lemmas.

    Same filtering as `nlp`, but with the spaCy pipeline supplied
    internally, so the function can be used where a single-argument
    callable is required.

    Args:
        input_list: Value handed unchanged to the spaCy pipeline.

    Returns:
        List of lower-cased lemmas with stop words, single punctuation
        characters, single digits and '-pron-' removed.
    """
    # Loading the spaCy model is expensive; do it once and keep it on the
    # function object instead of reloading it on every call.
    spacy_obj = getattr(tokenizer, '_spacy_obj', None)
    if spacy_obj is None:
        spacy_obj = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        tokenizer._spacy_obj = spacy_obj

    # The lemma filtering is identical to nlp(); reuse it rather than keeping
    # a second copy of the same logic.
    return nlp(input_list, spacy_obj)

In [7]:
def preprocess(input_str: str) -> list:
    """Strip HTML from a text and reduce it to filtered, lower-cased lemmas.

    Args:
        input_str: HTML markup to clean.

    Returns:
        The list of lemmas produced by `nlp` on the decoded text, or an
        empty list when processing raises a TypeError.
    """
    # Loading the spaCy model is expensive and this function is applied row
    # by row; load it once and keep it on the function object.
    spacy_object = getattr(preprocess, '_spacy_object', None)
    if spacy_object is None:
        spacy_object = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        preprocess._spacy_object = spacy_object

    try:
        return nlp(decode_html(input_str), spacy_object)
    except TypeError:
        # Return an empty list (not '') so the result always matches the
        # declared return type; ' '.join() of either gives ''.
        return []

In [8]:
def split_tags(tags: str) -> list:
    """Break a pipe-delimited tag string into its individual tags."""
    separator = '|'
    pieces = tags.split(separator)
    return pieces

In [9]:
def create_label(tags, mapping):
    """Return the label of the first tag that has an entry in `mapping`.

    Args:
        tags: Iterable of tags, checked in order.
        mapping: Dict from tag to label.

    Returns:
        `mapping[tag]` for the first tag found in `mapping`, or 0 when no
        tag is mapped (including when `tags` is empty).
    """
    for tag in tags:
        if tag in mapping:
            return mapping[tag]
    # The fallback belongs after the loop: returning 0 from inside it would
    # stop at the first tag and never look at the remaining ones.
    return 0

## Specifying GCP-Related Variables 

In [10]:
# Query the gcloud CLI for the configured project and publish it as PROJECT_ID,
# with any newline / carriage-return characters removed from the output.
os.environ['PROJECT_ID'] = (
    subprocess.run(
        'gcloud config list project --format "value(core.project)"',
        shell=True,
        check=True,  # a non-zero exit status raises CalledProcessError
        stdout=subprocess.PIPE,
    )
    .stdout
    .decode()
    .replace('\n', '')
    .replace('\r', '')
)
print(os.environ['PROJECT_ID'])

axa-ch-machine-learning-dev


In [11]:
# Hard-coded bucket name, published through the process environment.
os.environ['BUCKET_NAME']='axa-ch-machine-learning-poc-dev'

In [12]:
def missing_env_vars(names):
    """Return the entries of `names` that are not set in os.environ, in order."""
    return [name for name in names if name not in os.environ]


# Report every expected environment variable that is not defined.
# Each message names the exact variable that was looked up.
for env_name in missing_env_vars(['PROJECT_ID',
                                  'BUCKET_NAME',
                                  'GOOGLE_APPLICATION_CREDENTIALS',
                                  'REQUESTS_CA_BUNDLE',
                                  'AXA_CH_CA_BUNDLE']):
    print('Env variable ' + env_name + ' not defined!')

## Does the Connection Work?

In [13]:
# Any value other than 'N' makes the connectivity test below go through `proxies`.
use_proxy = 'Y'
# Proxy URL per scheme, read from HTTPS_PROXY and HTTP_PROXY
# (a missing variable raises KeyError here).
proxies = {scheme: os.environ[scheme.upper() + '_PROXY'] for scheme in ('https', 'http')}

In [14]:
list_url=['https://www.google.com',
          'http://www.google.com',
          'https://www.example.com',
          'http://www.example.com',
          'https://github.com/j0hannes/cutter-ng']

# Seconds to wait for each request before giving up. Without a timeout,
# requests.get can block indefinitely on an unresponsive host or proxy.
request_timeout = 30

# Try each URL and report whether it is reachable (directly or via the proxies).
for url in list_url:
    print('')
    print('trying to access:'+url)
    try:
        if use_proxy=='N':
            r = requests.get(url, timeout=request_timeout)
        else:
            # Certificate verification stays enabled when going through the proxy.
            r = requests.get(url, proxies=proxies, verify=True, timeout=request_timeout)

        if r.status_code == requests.codes.ok:
            print('=>OK',r.headers['content-type'])
        else:
            # e.g. 407 Proxy Authentication Required
            print ('=> ??', r.status_code)
    except Exception as inst:
        # Best-effort check: report the failure and continue with the next URL.
        print('=>FAILED')
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args
        print(inst)


trying to access:https://www.google.com
=>OK text/html; charset=ISO-8859-1

trying to access:http://www.google.com
=>OK text/html; charset=ISO-8859-1

trying to access:https://www.example.com
=>OK text/html; charset=UTF-8

trying to access:http://www.example.com
=>OK text/html; charset=UTF-8

trying to access:https://github.com/j0hannes/cutter-ng
=>OK text/html; charset=utf-8


# Preprocessing

In [27]:
# SQL text: every column of the rows in
# nlp_text_classification.stackoverflow_posts_complete whose `tags` is not empty.
query = """SELECT
 *
FROM
  `nlp_text_classification.stackoverflow_posts_complete`
WHERE
  tags <> ''
"""

In [28]:
# No project or credentials are passed explicitly to the client here.
client = bigquery.Client()
# Run the query and load the complete result into a pandas DataFrame.
df = client.query(query).to_dataframe()

In [29]:
# Add a `tags_new` column: each row's `tags` string split on '|' into a list
# (this modifies df in place).
df['tags_new'] = df['tags'].apply(split_tags)

In [76]:
def make_subset(df: pd.DataFrame, subset: list = None) -> pd.DataFrame:
    """Keep only the rows whose tags all belong to an allowed set.

    Args:
        df: Frame with at least the columns 'id', 'title', 'body', 'tags'
            and 'tags_new', where 'tags_new' holds an iterable of tags per
            row. The frame is not modified.
        subset: Allowed tags. Defaults to the ten tags used originally.

    Returns:
        A new, independent DataFrame with the columns 'id', 'title', 'body',
        'tags' and 'tags_new', containing the rows for which every tag is in
        `subset` (rows with no tags at all are kept).
    """
    if subset is None:
        subset = ['javascript', 'java', 'c#', 'php', 'python', 'android', 'jquery', 'html', 'c++', 'ios']
    allowed = set(subset)

    # Build the row mask separately instead of writing a helper column into
    # the caller's frame.
    keep = pd.Series(
        [all(tag in allowed for tag in tags) for tags in df['tags_new']],
        index=df.index,
        dtype=bool,
    )

    # .copy() so callers can add columns to the result without touching (or
    # being warned about) the original frame.
    return df.loc[keep.values, ['id', 'title', 'body', 'tags', 'tags_new']].copy()

In [77]:
# Restrict the posts to the rows selected by make_subset.
df_subset = make_subset(df)
# Add `body_new`: the result of preprocess() applied to each post body.
df_subset['body_new'] = df_subset['body'].apply(preprocess)

# TF-IDF and Random Forest

In [112]:
# Features: each row's `body_new` items joined into one space-separated string.
x = df_subset['body_new'].apply(lambda array: ' '.join(array))
# Targets: binary indicator matrix built from the tag lists. The fitted binarizer
# is not kept, so its column-to-tag mapping (classes_) is not available later.
y = MultiLabelBinarizer().fit_transform(df_subset['tags_new'])

In [113]:
# Hold out 20% of the rows for testing; random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

In [114]:
# Pipeline steps: TF-IDF features, then one random forest per label (one-vs-rest).
# random_state is fixed so that repeated runs of the grid search build the same
# forests; the value matches the seed used for the train/test split.
estimators = [('tfidf', TfidfVectorizer()),
              ('clf', OneVsRestClassifier(RandomForestClassifier(random_state=30)))]
# Grid: vocabulary size x n-gram range (unigrams only, bigrams only, trigrams only),
# with a single forest size.
parameters = {'tfidf__max_features': [1000, 2000],
              'clf__estimator__n_estimators': [100],
              'tfidf__ngram_range': [(1, 1), (2, 2), (3, 3)]}
p = Pipeline(estimators)

In [115]:
# 5-fold cross-validated search over `parameters`; training scores are recorded too.
grid = GridSearchCV(p, param_grid=parameters, cv=5, return_train_score=True)

In [116]:
# Run the grid search on the training data; the trailing ';' hides the cell output.
# NOTE(review): scikit-learn's fit() conventionally returns the estimator itself, so
# `score` is presumably just `grid`, not a score — confirm before using it.
score = grid.fit(x_train, y_train);



In [117]:
# Predict the label matrix for the held-out texts with the fitted grid search.
y_pred = grid.predict(x_test)

In [118]:
# confusion_matrix is imported here but not used in this cell.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-label precision / recall / F1 on the held-out set.
print(classification_report(y_test,y_pred))
# NOTE(review): for multilabel targets scikit-learn documents accuracy_score as
# subset accuracy (a row counts only if every label matches) — confirm this is
# the intended headline metric.
print(accuracy_score(y_test, y_pred))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.92      0.57      0.71        21
           1       0.00      0.00      0.00         7
           2       1.00      0.15      0.27        13
           3       0.00      0.00      0.00        16
           4       0.00      0.00      0.00         3
           5       0.83      0.31      0.45        16
           6       0.76      0.82      0.79        34
           7       0.88      0.61      0.72        23
           8       0.91      0.56      0.69        18
           9       1.00      0.36      0.53        11

   micro avg       0.84      0.46      0.60       162
   macro avg       0.63      0.34      0.42       162
weighted avg       0.73      0.46      0.54       162
 samples avg       0.50      0.47      0.47       162

0.3488372093023256


The model has a lot of problems that must be addressed:
<ul>
    <li>Too few samples</li>
    <li>Class imbalance when splitting into training and evaluation datasets</li>
    <li>Hyperparameter tuning</li>
    <li>Feature selection (context is not considered) and weighting</li>
</ul>        
It is just a first evaluation.

# BERT

In [121]:
# Install with the interpreter that runs this kernel (a bare `pip` on PATH may
# belong to a different environment) and pin the version for reproducibility.
!"{sys.executable}" -m pip install bert-tensorflow==1.0.1

Collecting bert-tensorflow
  Using cached https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1


In [123]:
# URL of the BERT module on tfhub.dev.
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
# Relative paths to the vocabulary, checkpoint and config files under data/.
BERT_VOCAB = 'data/uncased-l12-h768-a12/vocab.txt'
BERT_INIT_CHKPNT = 'data/uncased-l12-h768-a12/bert_model.ckpt'
BERT_CONFIG = 'data/uncased-l12-h768-a12/bert_config.json'

## Tokenization

In [136]:
# NOTE(review): this cell does not run. `bc` is the `bert` module imported at the
# top of the notebook, which has no `encode` attribute (see the AttributeError
# recorded below), and `texts2` is not defined in any visible cell.
# The call shape resembles a sentence-encoding client API — presumably a different
# package was intended; confirm and define `texts2` before re-enabling.
vecs = bc.encode(texts2, is_tokenized=True)

AttributeError: module 'bert' has no attribute 'encode'