In [1]:
%matplotlib inline


# Text preprocessing

The following example shows how to solve a simple NLP classification problem
with *auto-sklearn*.

For an introduction to text preprocessing, see the following links:

1. https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
2. https://machinelearningmastery.com/clean-text-machine-learning-python/


In [2]:
from pprint import pprint

import pandas as pd
import sklearn.metrics
from sklearn.datasets import fetch_20newsgroups

import autosklearn.classification

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Data Loading



In [3]:
# Restrict the task to two of the twenty newsgroups.
cats = ["comp.sys.ibm.pc.hardware", "rec.sport.baseball"]

# Training split: shuffled with a fixed seed so the run is reproducible.
# return_X_y=True makes the call return a (data, target) pair that is
# unpacked directly into the text documents and their labels.
X_train, y_train = fetch_20newsgroups(
    subset="train",
    categories=cats,
    shuffle=True,
    random_state=42,
    return_X_y=True,
)

# Test split: same two categories, kept aside for the final evaluation.
X_test, y_test = fetch_20newsgroups(
    subset="test",
    categories=cats,
    return_X_y=True,
)

## Creating a pandas dataframe
Both categorical and text features are often strings. Pandas stores Python strings
in the generic `object` dtype. Please ensure that the correct
[dtype](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) is applied to each
column.



In [4]:
# Wrap the raw documents of each split in a single-column dataframe.
# The "Text" column is given the pandas "string" dtype explicitly rather
# than being left as a generic object column.
X_train = pd.Series(X_train, dtype="string").to_frame(name="Text")
X_test = pd.Series(X_test, dtype="string").to_frame(name="Text")

## Build and fit a classifier



In [6]:
# Configure the AutoML search for a classification task.
# The auto-sklearn API reference documents both time limits in seconds and
# the memory limit in MB.
automl = autosklearn.classification.AutoSklearnClassifier(
    memory_limit=16384,
    per_run_time_limit=30,
    time_left_for_this_task=60,
)

# Run the search on the training dataframe; the call is left as the last
# expression of the cell so that its result is displayed.
automl.fit(
    X_train,
    y_train,
    dataset_name="20_Newsgroups",
)



AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      memory_limit=16384, per_run_time_limit=30,
                      time_left_for_this_task=60)

## View the models found by auto-sklearn



In [7]:
# Fetch the leaderboard table from the fitted estimator and print it as text.
leaderboard_table = automl.leaderboard()
print(leaderboard_table)

          rank  ensemble_weight                type      cost  duration
model_id                                                               
3            1             0.18                 mlp  0.020408  4.234085
9            2             0.14                 mlp  0.022959  4.664087
12           3             0.04                 mlp  0.030612  4.614324
2            4             0.14       random_forest  0.038265  4.366662
5            5             0.06                 mlp  0.038265  3.646028
7            6             0.02       random_forest  0.051020  3.803840
4            7             0.10         extra_trees  0.076531  3.967908
6            8             0.12                 qda  0.086735  3.527900
10           9             0.12  passive_aggressive  0.237245  3.558334
8           10             0.08          libsvm_svc  0.497449  3.621733


## Print the final ensemble constructed by auto-sklearn



In [8]:
# show_models() returns a dictionary keyed by model id; pretty-print it with
# a four-space indent so the nested entries stay readable.
ensemble_models = automl.show_models()
pprint(ensemble_models, indent=4)

{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x798a8fb5cf10>,
           'cost': 0.03826530612244894,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x798a8fc2e640>,
           'ensemble_weight': 0.14,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x798a8fb5ce20>,
           'model_id': 2,
           'rank': 1,
           'sklearn_classifier': RandomForestClassifier(max_features=10, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    3: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x798a8fba9fa0>,
           'cost': 0.020408163265306145,
           'data_preprocessor': <autoskle

## Get the Score of the final ensemble



In [9]:
# Predict labels for the held-out test documents and compare them with the
# true labels.
predictions = automl.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
print("Accuracy score:", accuracy)

Accuracy score: 0.982256020278834
