In [0]:
# !sudo apt-get install build-essential swig
# !curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
# !pip install auto-sklearn

In [0]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from keras.utils.np_utils import to_categorical
import numpy as np
import pandas as pd
import warnings
from keras.preprocessing.text import Tokenizer


In [0]:
VALIDATION_SPLIT = 0.2
max_words = 10000  # We will only consider the top 10,000 words in the dataset
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000

In [0]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
data = pd.read_json("/content/drive/My Drive/INFO7374/Assignment3/Complete.json", orient='records')
data = data.apply(lambda x: x.astype(str).str.lower())
data['sentiment'] = data['sentiment'].replace('neutra;', 'neutral')
data['sentiment'] = data['sentiment'].replace('postive', 'positive')
print('dataset loaded with shape', data.shape)
print(data.head(5))

dataset loaded with shape (1325, 2)
     sentiment                                               text
0     positive  thank you, ellen. we have a strong 2018, with ...
1     positive  stock-based compensation totaled $2.3 billion....
10    negative  for the full-year 2018, other bets revenues we...
100    neutral  great. thank you. i just wanted to follow up o...
1000   neutral                                           operator


In [0]:
X_raw = data["text"]
Y_raw = data["sentiment"]

In [0]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 
tokenizer.fit_on_texts(X_raw)
sequences = tokenizer.texts_to_sequences(X_raw)
word_index = tokenizer.word_index
X_processed = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

encoder = LabelEncoder()
encoder.fit(Y_raw)
Y_processed = encoder.transform(Y_raw)
Y_processed = to_categorical(np.asarray(Y_processed), 3)

print('Found %s unique tokens.' % len(word_index))

Found 5562 unique tokens.


In [0]:
indices = np.arange(X_processed.shape[0])
X_processed = X_processed[indices]
Y_processed = Y_processed[indices]
nb_validation_samples = int(VALIDATION_SPLIT * X_processed.shape[0])
x_train = X_processed[:-nb_validation_samples]
y_train = Y_processed[:-nb_validation_samples]
x_val = X_processed[-nb_validation_samples:]
y_val = Y_processed[-nb_validation_samples:]

x_train, x_test, y_train, y_test = train_test_split(np.array(X_processed),
                                                    np.array(Y_processed), test_size=0.2)

In [0]:
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=120)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    automl.fit(x_train, y_train)

y_hat_train = automl.predict(x_train)
y_hat_test = automl.predict(x_test)

Time limit for a single run is higher than total time limit. Capping the limit for a single run to the total time given to SMAC (119.751327)


In [0]:
print("accuracy on training set: %f" % sklearn.metrics.accuracy_score(y_train, y_hat_train))
print("accuracy on test set: %f" % sklearn.metrics.accuracy_score(y_test, y_hat_test))

accuracy on training set: 0.800000
accuracy on test set: 0.441509


In [0]:
automl.cv_results_['params'][np.argmax(automl.cv_results_['mean_test_score'])]

{'balancing:strategy': 'none',
 'categorical_encoding:__choice__': 'one_hot_encoding',
 'categorical_encoding:one_hot_encoding:minimum_fraction': 0.01,
 'categorical_encoding:one_hot_encoding:use_minimum_fraction': 'True',
 'classifier:__choice__': 'random_forest',
 'classifier:random_forest:bootstrap': 'True',
 'classifier:random_forest:criterion': 'gini',
 'classifier:random_forest:max_depth': 'None',
 'classifier:random_forest:max_features': 0.5,
 'classifier:random_forest:max_leaf_nodes': 'None',
 'classifier:random_forest:min_impurity_decrease': 0.0,
 'classifier:random_forest:min_samples_leaf': 1,
 'classifier:random_forest:min_samples_split': 2,
 'classifier:random_forest:min_weight_fraction_leaf': 0.0,
 'classifier:random_forest:n_estimators': 100,
 'imputation:strategy': 'mean',
 'preprocessor:__choice__': 'no_preprocessing',
 'rescaling:__choice__': 'standardize'}

In [0]:
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: a2e530f2e2be706da1be92511edc7952
  Metric: f1_macro
  Best validation score: 0.418096
  Number of target algorithm runs: 14
  Number of successful target algorithm runs: 11
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 2

