**AutoML Assignment:**

*Problem statement*: **Machine Learning Pipeline Automation**

Build an accelerator that automates every step of ML model development.

# Write an automated ML function that can be called with any data frame (dataset) and returns a well-trained model.

In [None]:
#Write an AutoML function.
def _infer_task(y):
    """Guess the learning task from the target column's dtype and cardinality."""
    from pandas.api import types as pdt

    numeric = pdt.is_numeric_dtype(y) and not pdt.is_bool_dtype(y)
    # Float targets and integer targets with many distinct values are treated
    # as continuous; everything else (strings, booleans, a handful of integer
    # codes) is treated as class labels. Callers can override with task=...
    if numeric and (pdt.is_float_dtype(y) or y.nunique() > 20):
        return "regression"
    return "classification"


def _build_preprocessor(X):
    """Return a ColumnTransformer that imputes/scales numeric columns and
    imputes/one-hot-encodes every other column of ``X``."""
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    numeric_cols = X.select_dtypes(include="number").columns.tolist()
    categorical_cols = [col for col in X.columns if col not in numeric_cols]

    transformers = []
    if numeric_cols:
        numeric_steps = Pipeline([
            ("impute", SimpleImputer(strategy="median")),
            ("scale", StandardScaler()),
        ])
        transformers.append(("num", numeric_steps, numeric_cols))
    if categorical_cols:
        categorical_steps = Pipeline([
            ("impute", SimpleImputer(strategy="most_frequent")),
            # Categories unseen during fit are encoded as all-zeros instead of
            # raising at prediction time.
            ("encode", OneHotEncoder(handle_unknown="ignore")),
        ])
        transformers.append(("cat", categorical_steps, categorical_cols))
    return ColumnTransformer(transformers)


def automl(**kwargs):
    """Train several candidate models on a data frame and return the best one.

    Each candidate is wrapped in a preprocessing pipeline, scored with
    cross-validation on a training split, and the best-scoring pipeline is
    refitted on that training split and returned.

    Keyword Args:
        data (pandas.DataFrame): Feature columns plus the target column.
            Required.
        target: Name of the target column. Defaults to the last column.
        task (str | None): ``"classification"``, ``"regression"``, or ``None``
            to infer it from the target column.
        test_size (float): Fraction of rows held out for the final score.
            Defaults to 0.25.
        cv (int): Requested number of cross-validation folds (reduced
            automatically for very small data). Defaults to 5.
        random_state (int): Seed for the split and the tree ensembles.
            Defaults to 0.
        verbose (bool): Print per-candidate and hold-out scores.
            Defaults to False.

    Returns:
        sklearn.pipeline.Pipeline: The fitted preprocessing + model pipeline.

    Raises:
        ValueError: ``data`` is missing, has fewer than two columns or fewer
            than four labelled rows, or ``task`` is not a supported value.
        TypeError: ``data`` is not a pandas DataFrame.
        KeyError: ``target`` is not a column of ``data``.
        RuntimeError: No candidate could be cross-validated.
    """
    import numpy as np
    import pandas as pd

    # --- Validate arguments before importing the heavier ML stack. ---
    data = kwargs.get("data")
    if data is None:
        raise ValueError("automl() requires a pandas DataFrame passed as data=...")
    if not isinstance(data, pd.DataFrame):
        raise TypeError(
            f"data must be a pandas DataFrame, got {type(data).__name__}"
        )
    if data.shape[1] < 2:
        raise ValueError("data needs at least one feature column and a target column")

    target = kwargs.get("target")
    if target is None:
        target = data.columns[-1]
    elif target not in data.columns:
        raise KeyError(f"target column {target!r} not found in data")

    task = kwargs.get("task")
    if task not in (None, "classification", "regression"):
        raise ValueError(
            f"task must be 'classification', 'regression' or None, got {task!r}"
        )

    test_size = kwargs.get("test_size", 0.25)
    cv = int(kwargs.get("cv", 5))
    random_state = kwargs.get("random_state", 0)
    verbose = kwargs.get("verbose", False)

    # Rows without a label cannot be used for supervised training.
    frame = data.dropna(subset=[target])
    if len(frame) < 4:
        raise ValueError("data needs at least four rows with a non-missing target")

    from sklearn.ensemble import (
        GradientBoostingClassifier,
        GradientBoostingRegressor,
        RandomForestClassifier,
        RandomForestRegressor,
    )
    from sklearn.linear_model import LogisticRegression, Ridge
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.pipeline import Pipeline

    y = frame[target]
    X = frame.drop(columns=[target])
    if task is None:
        task = _infer_task(y)

    if task == "classification":
        scoring = "accuracy"
        candidates = {
            "logistic_regression": LogisticRegression(max_iter=1000),
            "random_forest": RandomForestClassifier(random_state=random_state),
            "gradient_boosting": GradientBoostingClassifier(random_state=random_state),
        }
    else:
        scoring = "r2"
        candidates = {
            "ridge": Ridge(),
            "random_forest": RandomForestRegressor(random_state=random_state),
            "gradient_boosting": GradientBoostingRegressor(random_state=random_state),
        }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Stratified folds cannot outnumber the rarest class; plain folds cannot
    # outnumber the rows. Never go below two folds.
    if task == "classification":
        fold_cap = int(y_train.value_counts().min())
    else:
        fold_cap = len(y_train)
    folds = max(2, min(cv, fold_cap))

    cv_scores = {}
    for name, estimator in candidates.items():
        pipeline = Pipeline([
            ("preprocess", _build_preprocessor(X_train)),
            ("model", estimator),
        ])
        scores = cross_val_score(pipeline, X_train, y_train, cv=folds, scoring=scoring)
        # Failed or undefined folds come back as NaN; ignore them, and rank a
        # candidate with no usable fold last.
        usable = scores[~np.isnan(scores)]
        cv_scores[name] = float(usable.mean()) if usable.size else float("-inf")
        if verbose:
            print(f"{name}: mean CV {scoring} = {cv_scores[name]:.4f}")

    best_name = max(cv_scores, key=cv_scores.get)
    if cv_scores[best_name] == float("-inf"):
        raise RuntimeError("no candidate model could be cross-validated on this data")

    # cross_val_score works on clones, so the chosen estimator is still unfitted.
    trained_model = Pipeline([
        ("preprocess", _build_preprocessor(X_train)),
        ("model", candidates[best_name]),
    ])
    trained_model.fit(X_train, y_train)

    if verbose:
        holdout = trained_model.score(X_test, y_test)
        print(f"Selected {best_name} ({task}); hold-out score = {holdout:.4f}")

    # Return the trained model.
    return trained_model

In [2]:
!pip install autokeras
import tensorflow as tf
import autokeras as ak

Collecting autokeras
  Downloading autokeras-1.0.16-py3-none-any.whl (166 kB)
Collecting pandas
  Downloading pandas-1.3.3-cp39-cp39-win_amd64.whl (10.2 MB)
Collecting scikit-learn
  Downloading scikit_learn-1.0-cp39-cp39-win_amd64.whl (7.2 MB)
Collecting tensorflow<=2.5.0,>=2.3.0
  Downloading tensorflow-2.5.0-cp39-cp39-win_amd64.whl (422.6 MB)
Collecting keras-tuner>=1.0.2
  Downloading keras_tuner-1.0.4-py3-none-any.whl (97 kB)
Collecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Collecting scipy
  Downloading scipy-1.7.1-cp39-cp39-win_amd64.whl (33.8 MB)
Collecting tensorboard
  Downloading tensorboard-2.6.0-py3-none-any.whl (5.6 MB)
Collecting requests
  Downloading requests-2.26.0-py2.py3-none-any.whl (62 kB)
Collecting numpy
  Downloading numpy-1.21.2-cp39-cp39-win_amd64.whl (14.0 MB)
Collecting six~=1.15.0
  Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting protobuf>=3.9.2
  Downloading protobuf-3.18.0-cp39-cp39-win_amd64.whl (912 kB)
Collec

You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [3]:
!pip install scipy
!pip install sphinx
!pip install geopandas



You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


Collecting sphinx
  Downloading Sphinx-4.2.0-py3-none-any.whl (3.1 MB)
Collecting sphinxcontrib-serializinghtml>=1.1.5
  Downloading sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl (94 kB)
Collecting alabaster<0.8,>=0.7
  Downloading alabaster-0.7.12-py2.py3-none-any.whl (14 kB)
Collecting imagesize


You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


  Downloading imagesize-1.2.0-py2.py3-none-any.whl (4.8 kB)
Collecting sphinxcontrib-applehelp
  Downloading sphinxcontrib_applehelp-1.0.2-py2.py3-none-any.whl (121 kB)
Collecting docutils<0.18,>=0.14
  Downloading docutils-0.17.1-py2.py3-none-any.whl (575 kB)
Collecting snowballstemmer>=1.1
  Downloading snowballstemmer-2.1.0-py2.py3-none-any.whl (93 kB)
Collecting babel>=1.3
  Downloading Babel-2.9.1-py2.py3-none-any.whl (8.8 MB)
Collecting sphinxcontrib-htmlhelp>=2.0.0
  Downloading sphinxcontrib_htmlhelp-2.0.0-py2.py3-none-any.whl (100 kB)
Collecting Jinja2>=2.3
  Downloading Jinja2-3.0.1-py3-none-any.whl (133 kB)
Collecting sphinxcontrib-jsmath
  Downloading sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl (5.1 kB)
Collecting sphinxcontrib-devhelp
  Downloading sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl (84 kB)
Collecting sphinxcontrib-qthelp
  Downloading sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl (90 kB)
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-2.0.1-cp39-cp

    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-exm9v534\\fiona_67efa17bf4f54103acb8941e9f3b4e8b\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-exm9v534\\fiona_67efa17bf4f54103acb8941e9f3b4e8b\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Administrator\AppData\Local\Temp\pip-pip-egg-info-p3slgb_u'
         cwd: C:\Users\Administrator\AppData\Local\Temp\pip-install-exm9v534\fiona_67efa17bf4f54103acb8941e9f3b4e8b\

  Downloading geopandas-0.9.0-py2.py3-none-any.whl (994 kB)
Collecting fiona>=1.8
  Downloading Fiona-1.8.20.tar.gz (1.3 MB)
  Downloading Fiona-1.8.19.tar.gz (1.3 MB)
  Downloading Fiona-1.8.18.tar.gz (1.3 MB)
  Downloading Fiona-1.8.17.tar.gz (1.3 MB)
  Downloading Fiona-1.8.16.tar.gz (1.3 MB)
  Downloading Fiona-1.8.15.tar.gz (1.3 MB)
  Downloading Fiona-1.8.14.tar.gz (1.3 MB)
  Downloading Fiona-1.8.13.post1.tar.gz (1.2 MB)
  Downloading Fiona-1.8.13.tar.gz (1.2 MB)
  Downloading Fiona-1.8.12.tar.gz (1.2 MB)



    Complete output (1 lines):
    A GDAL API version must be specified. Provide a path to gdal-config using a GDAL_CONFIG environment variable or use a GDAL_VERSION environment variable.
    ----------------------------------------
    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-exm9v534\\fiona_169385ee982f4f3596578d69559fa0e0\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-exm9v534\\fiona_169385ee982f4f3596578d69559fa0e0\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Administr

In [4]:
!pip install deltalake #Installed but not used as of now.

  Downloading Fiona-1.8.11.tar.gz (1.2 MB)
  Downloading Fiona-1.8.10.tar.gz (1.2 MB)
  Downloading Fiona-1.8.9.post2.tar.gz (1.2 MB)
  Downloading Fiona-1.8.9.post1.tar.gz (1.2 MB)
  Downloading Fiona-1.8.9.tar.gz (1.2 MB)
  Downloading Fiona-1.8.8.tar.gz (1.7 MB)
  Downloading Fiona-1.8.7.tar.gz (1.7 MB)
  Downloading Fiona-1.8.6.tar.gz (1.7 MB)
  Downloading Fiona-1.8.5.tar.gz (1.7 MB)
  Downloading Fiona-1.8.4.tar.gz (1.1 MB)
  Downloading Fiona-1.8.3.tar.gz (1.1 MB)
  Downloading Fiona-1.8.2.tar.gz (1.2 MB)
  Downloading Fiona-1.8.1.tar.gz (1.1 MB)
  Downloading Fiona-1.8.0.tar.gz (1.4 MB)
Collecting geopandas
  Downloading geopandas-0.8.2-py2.py3-none-any.whl (962 kB)
Collecting shapely
  Downloading Shapely-1.7.1-cp39-cp39-win_amd64.whl (978 kB)
Collecting fiona
  Downloading Fiona-1.7.13.tar.gz (731 kB)
Collecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp39-cp39-win_amd64.whl (6.2 MB)
Collecting cligj>=0.4
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting click-p


    A GDAL API version must be specified. Provide a path to gdal-config using a GDAL_CONFIG environment variable or use a GDAL_VERSION environment variable.
    ----------------------------------------
    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-exm9v534\\fiona_48a8cf5f0bad4e4e84db9c38bb6d1cde\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-exm9v534\\fiona_48a8cf5f0bad4e4e84db9c38bb6d1cde\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Administrator\AppData\Local\Temp\pip-pip

**Install various libraries**

In [5]:
#Install necessary libraries.
# Checklist of shell / conda / notebook snippets collected while setting up the
# AutoML environment. The entries are plain strings; several are notes or
# pasted output rather than runnable commands.
install_list = [
    "!pip install mlbox",
    "!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o",
    "!pip install --upgrade pip",
    "user$ conda install -c h2oai h2o",
    "!python3 -m pip install --upgrade pip",
    "!pip3 install auto-sklearn",
    "!pip3 install --upgrade scipy",
    "!pip3 install --upgrade auto-sklearn",
    "!pip install auto-sklearn==0.10.0",
    "!sudo apt-get install build-essential swig",
    "!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install",
    "!pip install auto-sklearn==0.10.0",
    "!python3 -m pip install -U pip",
    "!python3 -m pip install -U setuptools wheel",
    "!python3 -m pip install -U 'mxnet<2.0.0'",
    "!python3 -m pip install autogluon",
    "!pip install matplotlib-venn",
    "!apt-get -qq install -y libfluidsynth1",
    "!pip install Pillow",
    "!pip uninstall PIL",
    "!pip uninstall Pillow",
    "!ypip install Pillow",
    "!pip3 install --upgrade pandas",
    "!pip install seaborn",
    "!pip install matplotlib",
    "!pip install --upgrade matplotlib",
    "!pip install geopandas",
    "!pip install autopytorch",
    "!pip install tpot",
    "!pip install ConfigSpace",
    "!pip install autokeras",
    "!pip install deltalake",  # Installed but not used as of now.
    "sns.set_style(style='ticks')",
    "conda install -c conda-forge tpot",
    "conda install -c conda-forge tpot xgboost dask dask-ml scikit-mdr skrebate",
    "conda env create -f tpot-cuml.yml -n tpot-cuml",
    "conda activate tpot-cuml",
    "alpha, Type:UniformFloat, Range: [0.0, 1.0], Default: 0.5",
    "$ cat requirements.txt | xargs -n 1 -L 1 pip install",
    "$ python setup.py install",
    "$ cd examples/",
    "Optimiser()",
    "opt.evaluate(params, df)",
    "classmlbox.model.classification.StackingClassifier(base_estimators=[<mlbox.model.classification.classifier.Classifier object>, <mlbox.model.classification.classifier.Classifier object>, <mlbox.model.classification.classifier.Classifier object>], level_estimator=<Mock name='mock()' id='139653242018560'>, n_folds=5, copy=False, drop_first=True, random_state=1, verbose=True)",
    'pyinstaller -F --hidden-import="sklearn.utils._cython_blas" --hidden-import="sklearn.neighbors.typedefs" --hidden-import="sklearn.neighbors.quad_tree" --hidden-import="sklearn.tree._utils" Datamanager.py',
]

# Walk the checklist. NOTE: nothing is executed here; each string is merely
# evaluated as an expression, so this loop installs nothing.
for each_command in install_list:
    # Only an empty entry produces any output.
    if not each_command:
        print("Search for another alternative")
        continue
    try:
        # Bare name expression: a no-op that cannot raise IOError.
        each_command
    except IOError:
        print("Invalid command.")  # Syntax error: invalid syntax.

In [6]:
!python3 -m pip install autogluon

Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.


In [7]:
!python3 -m pip install --upgrade pip

Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.


In [8]:
#Install autosklearn.
!pip3 install auto-sklearn

Collecting auto-sklearn

    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-ynysr0nk\\auto-sklearn_f7952d46e8ec4bdda0b14c05d6f4d205\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-ynysr0nk\\auto-sklearn_f7952d46e8ec4bdda0b14c05d6f4d205\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Administrator\AppData\Local\Temp\pip-pip-egg-info-losqosjf'
         cwd: C:\Users\Administrator\AppData\Local\Temp\pip-install-ynysr0nk\auto-sklearn_f7952d46e8ec4bdda0b14c05d6f4d205\
    Complete output (5 lines):
    Traceback 


  Downloading auto-sklearn-0.14.0.tar.gz (6.3 MB)
  Downloading auto-sklearn-0.13.0.tar.gz (6.3 MB)
  Downloading auto-sklearn-0.12.7.tar.gz (6.3 MB)
  Downloading auto-sklearn-0.12.6.tar.gz (6.1 MB)
  Downloading auto-sklearn-0.12.5.tar.gz (6.1 MB)
  Downloading auto-sklearn-0.12.4.tar.gz (6.1 MB)
  Downloading auto-sklearn-0.12.3.tar.gz (6.1 MB)
  Downloading auto-sklearn-0.12.2.tar.gz (6.1 MB)
  Downloading auto-sklearn-0.12.1.tar.gz (6.1 MB)
  Downloading auto-sklearn-0.12.0.tar.gz (4.1 MB)
  Downloading auto-sklearn-0.11.1.tar.gz (3.9 MB)
  Downloading auto-sklearn-0.11.0.tar.gz (3.9 MB)
  Downloading auto-sklearn-0.10.0.tar.gz (4.1 MB)
  Downloading auto-sklearn-0.9.0.tar.gz (4.7 MB)
  Downloading auto-sklearn-0.8.0.tar.gz (4.6 MB)
  Downloading auto-sklearn-0.7.1.tar.gz (4.6 MB)
  Downloading auto-sklearn-0.7.0.tar.gz (4.6 MB)


        raise ValueError(
    ValueError: Detected unsupported operating system: win32. Please check the compability information of auto-sklearn: https://automl.github.io/auto-sklearn/master/installation.html#windows-osx-compatibility
    ----------------------------------------
    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-ynysr0nk\\auto-sklearn_cdb04a017ec543228cb6f0d68f7f536e\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-ynysr0nk\\auto-sklearn_cdb04a017ec543228cb6f0d68f7f536e\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file_

In [9]:
!pip3 install --upgrade scipy
!pip3 install --upgrade auto-sklearn
!pip install auto-sklearn==0.10.0

!sudo apt-get install build-essential swig 
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install 
!pip install auto-sklearn==0.10.0

!pip install matplotlib-venn
!apt-get -qq install -y libfluidsynth1

  Downloading auto-sklearn-0.6.0.tar.gz (3.9 MB)
  Downloading auto-sklearn-0.5.2.tar.gz (3.4 MB)
  Downloading auto-sklearn-0.5.1.tar.gz (3.4 MB)
  Downloading auto-sklearn-0.5.0.tar.gz (3.4 MB)
  Downloading auto-sklearn-0.4.2.tar.gz (3.4 MB)
  Downloading auto-sklearn-0.4.1.tar.gz (3.4 MB)
  Downloading auto-sklearn-0.4.0.tar.gz (3.4 MB)
  Downloading auto-sklearn-0.3.0.tar.gz (3.4 MB)
  Downloading auto-sklearn-0.2.1.tar.gz (3.7 MB)
  Downloading auto-sklearn-0.2.0.tar.gz (3.6 MB)
  Downloading auto-sklearn-0.1.3.tar.gz (1.3 MB)
  Downloading auto-sklearn-0.1.2.tar.gz (5.9 MB)
  Downloading auto-sklearn-0.1.1.tar.gz (5.9 MB)
  Downloading auto-sklearn-0.1.0.tar.gz (5.9 MB)
  Downloading auto-sklearn-0.0.2.tar.gz (5.9 MB)
  Downloading auto-sklearn-0.0.1.tar.gz (5.9 MB)



    ----------------------------------------
    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-ynysr0nk\\auto-sklearn_0e91f721f52144fdb859b3c68dd54f83\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-ynysr0nk\\auto-sklearn_0e91f721f52144fdb859b3c68dd54f83\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Administrator\AppData\Local\Temp\pip-pip-egg-info-j4dvmt06'
         cwd: C:\Users\Administrator\AppData\Local\Temp\pip-install-ynysr0nk\auto-sklearn_0e91f721f52144fdb859b3c68dd54f83\



You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


Collecting auto-sklearn


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-jci4a3fv\\auto-sklearn_7e3f9621677c4f08a12f348cbaa25323\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-jci4a3fv\\auto-sklearn_7e3f9621677c4f08a12f348cbaa25323\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Administrator\AppData\Local\Temp\pip-pip-egg-info-abtqurxl'
         cwd: C:\Users\Administrator\AppData\Local\Temp\pip-install-jci4a3fv\auto-sklearn_7e3f9621677c4f08a12f348cbaa25323\
    Complete output (5 lines):
    Traceback 

  Using cached auto-sklearn-0.14.0.tar.gz (6.3 MB)
  Using cached auto-sklearn-0.13.0.tar.gz (6.3 MB)
  Using cached auto-sklearn-0.12.7.tar.gz (6.3 MB)
  Using cached auto-sklearn-0.12.6.tar.gz (6.1 MB)
  Using cached auto-sklearn-0.12.5.tar.gz (6.1 MB)
  Using cached auto-sklearn-0.12.4.tar.gz (6.1 MB)
  Using cached auto-sklearn-0.12.3.tar.gz (6.1 MB)
  Using cached auto-sklearn-0.12.2.tar.gz (6.1 MB)
  Using cached auto-sklearn-0.12.1.tar.gz (6.1 MB)
  Using cached auto-sklearn-0.12.0.tar.gz (4.1 MB)
  Using cached auto-sklearn-0.11.1.tar.gz (3.9 MB)
  Using cached auto-sklearn-0.11.0.tar.gz (3.9 MB)
  Using cached auto-sklearn-0.10.0.tar.gz (4.1 MB)
  Using cached auto-sklearn-0.9.0.tar.gz (4.7 MB)
  Using cached auto-sklearn-0.8.0.tar.gz (4.6 MB)
  Using cached auto-sklearn-0.7.1.tar.gz (4.6 MB)
  Using cached auto-sklearn-0.7.0.tar.gz (4.6 MB)
  Using cached auto-sklearn-0.6.0.tar.gz (3.9 MB)
  Using cached auto-sklearn-0.5.2.tar.gz (3.4 MB)
  Using cached auto-sklearn-0.5.1.tar

      File "C:\Users\Administrator\AppData\Local\Temp\pip-install-jci4a3fv\auto-sklearn_7e3f9621677c4f08a12f348cbaa25323\setup.py", line 9, in <module>
        raise ValueError(
    ValueError: Detected unsupported operating system: win32. Please check the compability information of auto-sklearn: https://automl.github.io/auto-sklearn/master/installation.html#windows-osx-compatibility
    ----------------------------------------
    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-jci4a3fv\\auto-sklearn_c3ce7a7602c048ef9c7430c03f8beb67\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-jci4a3fv\\auto-sklearn_c3ce7a7602c048ef9c7430c03f8beb67\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else

Collecting auto-sklearn==0.10.0


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-crt31cg4\\auto-sklearn_6901b14e52af4e05b77a0f1fc18e4963\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-crt31cg4\\auto-sklearn_6901b14e52af4e05b77a0f1fc18e4963\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Administrator\AppData\Local\Temp\pip-pip-egg-info-rhz5j8zn'
         cwd: C:\Users\Administrator\AppData\Local\Temp\pip-install-crt31cg4\auto-sklearn_6901b14e52af4e05b77a0f1fc18e4963\
    Complete output (5 lines):
    Traceback 

  Using cached auto-sklearn-0.10.0.tar.gz (4.1 MB)


'sudo' is not recognized as an internal or external command,
operable program or batch file.
'xargs' is not recognized as an internal or external command,
operable program or batch file.


Collecting auto-sklearn==0.10.0
  Using cached auto-sklearn-0.10.0.tar.gz (4.1 MB)


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-cj0acgjn\\auto-sklearn_c83b800177794d07a48ee7f395852574\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-cj0acgjn\\auto-sklearn_c83b800177794d07a48ee7f395852574\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Administrator\AppData\Local\Temp\pip-pip-egg-info-dn10ri_4'
         cwd: C:\Users\Administrator\AppData\Local\Temp\pip-install-cj0acgjn\auto-sklearn_c83b800177794d07a48ee7f395852574\
    Complete output (5 lines):
    Traceback 

Collecting matplotlib-venn
  Downloading matplotlib-venn-0.11.6.tar.gz (29 kB)
Collecting matplotlib
  Downloading matplotlib-3.4.3-cp39-cp39-win_amd64.whl (7.1 MB)
Collecting pillow>=6.2.0
  Downloading Pillow-8.3.2-cp39-cp39-win_amd64.whl (3.2 MB)
Collecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.2-cp39-cp39-win_amd64.whl (52 kB)
Building wheels for collected packages: matplotlib-venn
  Building wheel for matplotlib-venn (setup.py): started
  Building wheel for matplotlib-venn (setup.py): finished with status 'done'
  Created wheel for matplotlib-venn: filename=matplotlib_venn-0.11.6-py3-none-any.whl size=32065 sha256=b902eeaa5921c9004a68619936dc17b69982bdec48a87fdc0965cb5e58f70eb3
  Stored in directory: c:\users\administrator\appdata\local\pip\cache\wheels\42\1c\65\1a733895cd94885d1a97eee84ec7595ff7fddc277a45bcacf5
Successfully built matplotlib-venn
Installing collected packages: pillow, kiwiso

'apt-get' is not recognized as an internal or external command,
operable program or batch file.


In [10]:
!pip install geopandas
!pip3 uninstall statsmodels

Collecting geopandas

    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-6a_hvha8\\fiona_57ea58f7ce9a48698452aa98b12aa218\\setup.py'"'"'; __file__='"'"'C:\\Users\\Administrator\\AppData\\Local\\Temp\\pip-install-6a_hvha8\\fiona_57ea58f7ce9a48698452aa98b12aa218\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Administrator\AppData\Local\Temp\pip-pip-egg-info-rrekmhs0'
         cwd: C:\Users\Administrator\AppData\Local\Temp\pip-install-6a_hvha8\fiona_57ea58f7ce9a48698452aa98b12aa218\
    Complete output (1 lines):
    A GDAL API version must be spec


  Using cached geopandas-0.9.0-py2.py3-none-any.whl (994 kB)
Collecting fiona>=1.8
  Using cached Fiona-1.8.20.tar.gz (1.3 MB)
  Using cached Fiona-1.8.19.tar.gz (1.3 MB)
  Using cached Fiona-1.8.18.tar.gz (1.3 MB)
  Using cached Fiona-1.8.17.tar.gz (1.3 MB)
  Using cached Fiona-1.8.16.tar.gz (1.3 MB)
  Using cached Fiona-1.8.15.tar.gz (1.3 MB)
  Using cached Fiona-1.8.14.tar.gz (1.3 MB)
  Using cached Fiona-1.8.13.post1.tar.gz (1.2 MB)
  Using cached Fiona-1.8.13.tar.gz (1.2 MB)
  Using cached Fiona-1.8.12.tar.gz (1.2 MB)
  Using cached Fiona-1.8.11.tar.gz (1.2 MB)
  Using cached Fiona-1.8.10.tar.gz (1.2 MB)
  Using cached Fiona-1.8.9.post2.tar.gz (1.2 MB)
  Using cached Fiona-1.8.9.post1.tar.gz (1.2 MB)
  Using cached Fiona-1.8.9.tar.gz (1.2 MB)
  Using cached Fiona-1.8.8.tar.gz (1.7 MB)
  Using cached Fiona-1.8.7.tar.gz (1.7 MB)
  Using cached Fiona-1.8.6.tar.gz (1.7 MB)
  Using cached Fiona-1.8.5.tar.gz (1.7 MB)
  Using cached Fiona-1.8.4.tar.gz (1.1 MB)
  Using cached Fiona-1.8.3



**Install MLBox and H2O**

In [11]:
!pip install mlbox
!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

Collecting mlbox
  Downloading mlbox-0.8.5.tar.gz (31 kB)
Collecting numpy==1.18.2
  Downloading numpy-1.18.2.zip (5.4 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe' 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\lib\site-packages\pip\_vendor\pep517\in_process\_in_process.py' prepare_metadata_for_build_wheel 'C:\Users\ADMINI~1\AppData\Local\Temp\tmp62ywz1pn'
         cwd: C:\Users\Administrator\AppData\Local\Temp\pip-install-el6j84ou\numpy_dba93ea81e3e4141af63b578863c27af
    Complete output (195 lines):
    Running from numpy source directory.
      run_build = parse_setuppy_commands()


  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'error'
Collecting mlbox
  Downloading mlbox-0.8.4.tar.gz (31 kB)
  Downloading mlbox-0.8.3.tar.gz (31 kB)
Collecting numpy==1.17.0
  Downloading numpy-1.17.0.zip (6.5 MB)
Collecting scipy==1.3.0
  Downloading scipy-1.3.0.tar.gz (23.6 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
Collecting mlbox
  Downloading mlbox-0.8.2.tar.gz (30 kB)
  Downloading mlbox-0.8.1.tar.gz (31 kB)
  Downloading mlbox-0.8.0.tar.gz (31 kB)
Collecting numpy==1.16.3
  Downloading numpy-1.16.3.zip (5.1 MB)
Collecting scipy==1.2.1
  Downloading scipy-1.2.1.tar.gz (23.1 MB)
Collecting matplotlib==2.2.4
  Downloading matplotlib-2.2.4.tar.gz (37.0 MB)
Collecting mlbox
  Downloading mlbox-0.7.0.tar.gz (31 kB)
  Downloading mlbox-0.6.2.tar.gz (32 kB)
Collecting hyperopt==0.1
  Downloading hyperopt-0.1.ta

    Cythonizing sources
    Processing numpy/random\_bounded_integers.pxd.in
    Processing numpy/random\mtrand.pyx
    Processing numpy/random\_bit_generator.pyx
    Processing numpy/random\_bounded_integers.pyx.in
    Processing numpy/random\_common.pyx
    Processing numpy/random\_generator.pyx
    Processing numpy/random\_mt19937.pyx
    Processing numpy/random\_pcg64.pyx
    Processing numpy/random\_philox.pyx
    Processing numpy/random\_sfc64.pyx
    blas_opt_info:
    blas_mkl_info:
    No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
    customize MSVCCompiler
      libraries mkl_rt not found in ['C:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python39\\lib', 'C:\\', 'C:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python39\\libs']
      NOT AVAILABLE
    
    blis_info:
      libraries blis not found in ['C:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python39\\lib', 'C:\\', 'C:\\Users\\Administrat

Looking in links: http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html


You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


Collecting h2o
  Downloading h2o-3.34.0.1.tar.gz (175.8 MB)
Collecting tabulate
  Downloading tabulate-0.8.9-py3-none-any.whl (25 kB)
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py): started
  Building wheel for h2o (setup.py): finished with status 'done'
  Created wheel for h2o: filename=h2o-3.34.0.1-py2.py3-none-any.whl size=175823552 sha256=ce2be89959f904f22edb9e4aea5acbef9ee1a86af526a63611e43f9107a6d95c
  Stored in directory: c:\users\administrator\appdata\local\pip\cache\wheels\aa\50\c6\a1d1e84160885a4bdaab1ae2a9cba26f3a35ab08409444616c
Successfully built h2o
Installing collected packages: tabulate, h2o
Successfully installed h2o-3.34.0.1 tabulate-0.8.9


In [12]:
!pip install seaborn
!pip install matplotlib
!pip install --upgrade matplotlib

Collecting seaborn


You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


  Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.11.2

You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.





Train and test a Gradient Boosting Machine (GBM) model:


In [13]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import h2o
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

#Scrape the 5-cell table rows from the page below and fit an H2O GBM on them.
url = "https://www.kaggle.com/fatiimaezzahra/famous-iconic-women"
#Make a GET request to fetch the raw HTML content; fail fast on HTTP errors
#instead of parsing an error page as data.
page = requests.get(url, timeout=30)
page.raise_for_status()
web_content = page.content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove newlines and surrounding whitespace from the text of every cell.
extract_contents = lambda row: [x.text.replace('\n', '').strip() for x in row]
#Find all table rows and keep the ones that carry exactly 5 data cells.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td'))
  if len(stat) == 5:
    stats.append(stat)

if not stats:
  #Without any matching rows there is nothing to split or train on.
  print("No table rows with 5 cells were found at " + url + "; nothing to train on.")
else:
  #Build the dataframe once, after every row has been collected. Only <td> cells
  #are scraped, so no header names are available and positional names are used.
  stats_data = pd.DataFrame(data=stats, columns=[f"col_{i}" for i in range(5)])
  stats_data.head()
  #Scraped values are strings; convert them to int (raises ValueError on
  #non-numeric text).
  kaggle_data = stats_data.astype(int)
  #NOTE(review): assumes the last column is the binary label - confirm.
  target_col = kaggle_data.columns[-1]
  X = kaggle_data.drop(columns=[target_col])
  y = kaggle_data[target_col]
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

  #H2O trains on H2OFrames held by a running cluster, not on pandas objects.
  h2o.init()
  train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
  valid = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))
  #A bernoulli GBM needs a categorical (factor) response column.
  train[target_col] = train[target_col].asfactor()
  valid[target_col] = valid[target_col].asfactor()
  model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                       ntrees=100,
                                       max_depth=4,
                                       learn_rate=0.1)
  #Specify the predictor set; the response is passed by column name.
  x = list(X.columns)
  #Train the model.
  model.train(x=x, y=target_col, training_frame=train, validation_frame=valid)
  print(model)

ModuleNotFoundError: No module named 'bs4'

In [None]:
#Performing Weather forecast using H2O:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import h2o
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

#Scrape the 5-cell table rows from the page below and fit an H2O GBM on them.
url="https://www.climate.gov/maps-data/datasets"
#Make a GET request to fetch the raw HTML content; fail fast on HTTP errors
#instead of parsing an error page as data.
page = requests.get(url, timeout=30)
page.raise_for_status()
web_content = page.content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove newlines and surrounding whitespace from the text of every cell.
extract_contents = lambda row: [x.text.replace('\n', '').strip() for x in row]
#Find all table rows and keep the ones that carry exactly 5 data cells.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td'))
  if len(stat) == 5:
    stats.append(stat)

if not stats:
  #Without any matching rows there is nothing to split or train on.
  print("No table rows with 5 cells were found at " + url + "; nothing to train on.")
else:
  #Build the dataframe once, after every row has been collected. Only <td> cells
  #are scraped, so no header names are available and positional names are used.
  stats_data = pd.DataFrame(data=stats, columns=[f"col_{i}" for i in range(5)])
  stats_data.head()
  #Scraped values are strings; convert them to int (raises ValueError on
  #non-numeric text).
  weather_data = stats_data.astype(int)
  #NOTE(review): assumes the last column is the binary label - confirm.
  target_col = weather_data.columns[-1]
  X = weather_data.drop(columns=[target_col])
  y = weather_data[target_col]
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1313)

  #H2O trains on H2OFrames held by a running cluster, not on pandas objects.
  h2o.init()
  train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
  valid = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))
  #A bernoulli GBM needs a categorical (factor) response column.
  train[target_col] = train[target_col].asfactor()
  valid[target_col] = valid[target_col].asfactor()
  model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                       ntrees=100,
                                       max_depth=4,
                                       learn_rate=0.1)
  #Specify the predictor set; the response is passed by column name.
  x = list(X.columns)
  #Train the model.
  model.train(x=x, y=target_col, training_frame=train, validation_frame=valid)
  print(model)

In [None]:
#MLbox
from mlbox.optimisation import Optimiser, Regressor
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import h2o
#Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

#Scrape the 5-cell table rows from the page below, fit an H2O GBM on them and
#then score an MLBox pipeline on the same training split.
url="https://www.kaggle.com/mattiuzc/stock-exchange-data"
#Make a GET request to fetch the raw HTML content; fail fast on HTTP errors
#instead of parsing an error page as data.
page = requests.get(url, timeout=30)
page.raise_for_status()
web_content = page.content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove newlines and surrounding whitespace from the text of every cell.
extract_contents = lambda row: [x.text.replace('\n', '').strip() for x in row]
#Find all table rows and keep the ones that carry exactly 5 data cells.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td'))
  if len(stat) == 5:
    stats.append(stat)

if not stats:
  #Without any matching rows there is nothing to split, train or evaluate.
  print("No table rows with 5 cells were found at " + url + "; nothing to train on.")
else:
  #Build the dataframe once, after every row has been collected. Only <td> cells
  #are scraped, so no header names are available and positional names are used.
  stats_data = pd.DataFrame(data=stats, columns=[f"col_{i}" for i in range(5)])
  stats_data.head()
  #Scraped values are strings; convert them to int (raises ValueError on
  #non-numeric text).
  stock_data = stats_data.astype(int)
  #NOTE(review): assumes the last column is the binary label - confirm.
  target_col = stock_data.columns[-1]
  X = stock_data.drop(columns=[target_col])
  y = stock_data[target_col]
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1313)

  #H2O trains on H2OFrames held by a running cluster, not on pandas objects.
  h2o.init()
  train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
  valid = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))
  #A bernoulli GBM needs a categorical (factor) response column.
  train[target_col] = train[target_col].asfactor()
  valid[target_col] = valid[target_col].asfactor()
  model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                       ntrees=100,
                                       max_depth=4,
                                       learn_rate=0.1)
  #Specify the predictor set; the response is passed by column name.
  x = list(X.columns)
  #Train the model.
  model.train(x=x, y=target_col, training_frame=train, validation_frame=valid)
  print(model)

  #Evaluate the MLBox pipeline once, on the training split built above.
  opt = Optimiser()
  params = {"ne__numerical_strategy" : 0, "ce__strategy" : "label_encoding", "fs__threshold" : 0.1, "stck__base_estimators" : [Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")], "est__strategy" : "Linear"}

  #MLBox takes a dict holding the feature frame and the matching target series.
  df = {"train" : X_train, "target" : y_train}
  opt.evaluate(params, df)

In [None]:
#MNIST dataset
from mlbox.optimisation import Optimiser, Regressor
import pandas as pd
#NOTE(review): the Colab sample MNIST CSVs are understood to ship without a header
#row and with the digit label in the first column - confirm against the files.
mnist_train_data=pd.read_csv("/content/sample_data/mnist_train_small.csv", header=None)
mnist_test_data=pd.read_csv("/content/sample_data/mnist_test.csv", header=None)
#Stack the two splits row-wise. pd.merge without keys is an inner join on every
#shared column, which does not append one split to the other.
mnist_data = pd.concat([mnist_train_data, mnist_test_data], ignore_index=True)
#Load the data.
dataset = mnist_data
#Evaluate the pipeline.
opt = Optimiser()
params = {"ne__numerical_strategy" : 0, "ce__strategy" : "label_encoding", "fs__threshold" : 0.1, "stck__base_estimators" : [Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")], "est__strategy" : "Linear"}
#Features and target must describe the same rows, so both are taken from the
#training frame (the target previously came from the test frame).
df = {"train" : mnist_train_data.iloc[:, 1:], "target" : mnist_train_data.iloc[:, 0]}

#Assemble the Keras network in one go: two 64-unit ReLU (Rectified Linear Unit)
#hidden layers followed by a 10-unit softmax output layer.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(10, activation='softmax'),
])
#Define a ConvModel.
class ConvModel(tf.keras.Model):
    """Stack of ConvLayer blocks followed by average pooling and a softmax Dense head.

    Parameters:
        nfs: sequence of filter counts, one ConvLayer per entry (must be non-empty).
        input_shape: forwarded to the first ConvLayer as its ``input_shape``.
        output_shape: number of units of the final softmax layer.
        use_bn, use_dp: stored on the instance; not used by this class itself.
    """

    def __init__(self, nfs, input_shape, output_shape, use_bn=False, use_dp=False):
        super().__init__(name='mlp')
        self.use_bn = use_bn
        self.use_dp = use_dp
        #There is no num_classes argument; the class count is the size of the
        #softmax head.
        self.num_classes = output_shape

        #Backbone layers: the first block uses stride 1, the rest ConvLayer's default.
        self.convs = [ConvLayer(nfs[0], s=1, input_shape=input_shape)]
        self.convs += [ConvLayer(nf) for nf in nfs[1:]]
        #Classification layers (referenced through tf.keras.layers, which this
        #cell imports).
        #NOTE(review): AveragePooling2D keeps the spatial axes, so the Dense head
        #is applied per position - confirm global pooling was not intended.
        self.convs.append(tf.keras.layers.AveragePooling2D())
        self.convs.append(tf.keras.layers.Dense(output_shape, activation='softmax'))

    def call(self, inputs):
        """Run ``inputs`` through every stored layer in order and return the result."""
        for layer in self.convs:
            inputs = layer(inputs)
        return inputs
#Compile the model.
#Keras loss identifiers use underscores; 'categorical crossentropy' (with a space)
#is not a registered loss name.
#NOTE(review): categorical_crossentropy expects one-hot targets; if the labels are
#plain integers, sparse_categorical_crossentropy is the matching loss - confirm.
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='rmsprop')
#NOTE(review): builds for 32x32x3 inputs - confirm this matches the data fed to fit().
model.build((None, 32, 32, 3))

model.summary()

import requests
from bs4 import BeautifulSoup
#Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

#Famous iconic women dataset
#Scrape the table rows of the page below, then (inside the loops) try to fit the
#Keras `model` compiled earlier in this cell.
url = "https://www.kaggle.com/fatiimaezzahra/famous-iconic-women"
#Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Strip newline characters from the text of each element (surrounding spaces are
#left untouched).
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
#Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  #Keep only rows that have exactly 5 data cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  #NOTE(review): new_cols is assigned but not read anywhere in this cell.
  new_cols = []
  #NOTE(review): iterating `row` presumably walks the child nodes of the <tr> tag,
  #so everything below runs once per child of every row - confirm intent.
  for each_new_col in row:
    #NOTE(review): `columns` receives a single node rather than a list of names.
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    #NOTE(review): kaggle_data is not assigned in this cell - confirm it exists.
    kaggle_data[each_new_col] = stats_data[each_new_col].map(int)
    #NOTE(review): unpacking a DataFrame presumably yields its column labels, not
    #feature/target data - verify.
    X, y = stats_data
    #NOTE(review): `sklearn` is not imported in this cell.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=11)
    
    #NOTE(review): x_train (lower-case) is not assigned in this cell; the split
    #above binds X_train.
    history = model.fit(x_train, y_train,
                    batch_size=64,
                    epochs=1)

    model.summary()
    #Standalone demo: add two random (2, 3, 4) tensors with a Keras Add layer.
    #This rebinds y.
    input_shape = (2, 3, 4)
    x1 = tf.random.normal(input_shape)
    x2 = tf.random.normal(input_shape)
    y = tf.keras.layers.Add()([x1, x2])
    print(y.shape)

    #An LSTM layer is constructed here but not assigned or used.
    tf.keras.layers.LSTM(3, activation='tanh', recurrent_activation='sigmoid',
        use_bias=True, kernel_initializer='glorot_uniform',
        recurrent_initializer='orthogonal',
        bias_initializer='zeros', unit_forget_bias=True, dropout=0.0, recurrent_dropout=0.0,
        return_sequences=False, return_state=False, go_backwards=False, stateful=False,
        time_major=False, unroll=False)

    #Define a ConvLayer.
    class ConvLayer(Layer) :
        """Bias-free, "same"-padded Conv2D with linear activation, followed by a GeneralReLU.

        Parameters:
            nf: number of convolution filters.
            ks: kernel size passed to Conv2D.
            s: stride passed to Conv2D (an int or a (height, width) pair).
            **kwargs: forwarded to the base Layer constructor.
        """

        def __init__(self, nf, ks=3, s=2, **kwargs):
            #Initialise the base Layer before attaching sub-layers to the instance.
            super(ConvLayer, self).__init__(**kwargs)
            self.nf = nf
            #Remember the stride so compute_output_shape can honour it.
            self.s = s
            self.grelu = GeneralReLU(leak=0.01)
            self.conv = (Conv2D(filters     = nf,
                                kernel_size = ks,
                                strides     = s,
                                padding     = "same",
                                use_bias    = False,
                                activation  = "linear"))

        def rsub(self): return -self.grelu.sub
        def set_sub(self, v): self.grelu.sub = -v
        #Keras layers expose their variables as `weights`; with use_bias=False the
        #first entry is the convolution kernel.
        def conv_weights(self): return self.conv.weights[0]

        def build(self, input_shape):
            #No weight of its own to create.
            super(ConvLayer, self).build(input_shape)  #Be sure to call this at the end.

        def compute_output_shape(self, input_shape):
            """Return (batch, height, width, nf) for a channels-last ``input_shape``."""
            stride_h, stride_w = (self.s, self.s) if isinstance(self.s, int) else self.s

            def _same_padded(dim, stride):
                #"same" padding yields ceil(dim / stride); unknown dims stay unknown.
                return None if dim is None else -(-dim // stride)

            output_shape = (input_shape[0],
                            _same_padded(input_shape[1], stride_h),
                            _same_padded(input_shape[2], stride_w),
                            self.nf)
            return output_shape

        def call(self, x):
            return self.grelu(self.conv(x))

        def __repr__(self):
            return f'ConvLayer(nf={self.nf}, activation={self.grelu})'

    #Score the MLBox pipeline on the df/params built at the top of this cell.
    opt.evaluate(params, df)

In [None]:
#Outbrain Click Prediction dataset
from mlbox.optimisation import Optimiser, Regressor
import pandas as pd
clicks_train_data=pd.read_csv("C:/Users/Administrator/OneDrive - Bitwise Solutions Private Limited/Documents/AutoML/OutbrainClickPrediction/clicks_train.csv")
clicks_test_data=pd.read_csv("C:/Users/Administrator/OneDrive - Bitwise Solutions Private Limited/Documents/AutoML/OutbrainClickPrediction/clicks_test.csv")
#Stack the two splits row-wise. pd.merge without keys is an inner join on every
#shared column, which does not append one split to the other.
#NOTE(review): columns present in only one split are filled with NaN - confirm
#both files share a schema.
clicks_data = pd.concat([clicks_train_data, clicks_test_data], ignore_index=True)
#Load the data.
dataset = clicks_data
#Evaluate the pipeline.
opt = Optimiser()
params = {"ne__numerical_strategy" : 0, "ce__strategy" : "label_encoding", "fs__threshold" : 0.1, "stck__base_estimators" : [Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")], "est__strategy" : "Linear"}
#Features and target must describe the same rows, so both are taken from the
#training frame (the target previously came from the test frame).
#NOTE(review): assumes the last column of clicks_train.csv is the label - confirm.
df = {"train" : clicks_train_data.iloc[:, :-1], "target" : clicks_train_data.iloc[:, -1]}

#Assemble the Keras network in one go: two 64-unit ReLU (Rectified Linear Unit)
#hidden layers followed by a 10-unit softmax output layer.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(10, activation='softmax'),
])
#Define a ConvModel.
class ConvModel(tf.keras.Model):
    """Stack of ConvLayer blocks followed by average pooling and a softmax Dense head.

    Parameters:
        nfs: sequence of filter counts, one ConvLayer per entry (must be non-empty).
        input_shape: forwarded to the first ConvLayer as its ``input_shape``.
        output_shape: number of units of the final softmax layer.
        use_bn, use_dp: stored on the instance; not used by this class itself.
    """

    def __init__(self, nfs, input_shape, output_shape, use_bn=False, use_dp=False):
        super().__init__(name='mlp')
        self.use_bn = use_bn
        self.use_dp = use_dp
        #There is no num_classes argument; the class count is the size of the
        #softmax head.
        self.num_classes = output_shape

        #Backbone layers: the first block uses stride 1, the rest ConvLayer's default.
        self.convs = [ConvLayer(nfs[0], s=1, input_shape=input_shape)]
        self.convs += [ConvLayer(nf) for nf in nfs[1:]]
        #Classification layers (referenced through tf.keras.layers, which this
        #cell imports).
        #NOTE(review): AveragePooling2D keeps the spatial axes, so the Dense head
        #is applied per position - confirm global pooling was not intended.
        self.convs.append(tf.keras.layers.AveragePooling2D())
        self.convs.append(tf.keras.layers.Dense(output_shape, activation='softmax'))

    def call(self, inputs):
        """Run ``inputs`` through every stored layer in order and return the result."""
        for layer in self.convs:
            inputs = layer(inputs)
        return inputs
#Compile the model.
#Keras loss identifiers use underscores; 'categorical crossentropy' (with a space)
#is not a registered loss name.
#NOTE(review): categorical_crossentropy expects one-hot targets; if the labels are
#plain integers, sparse_categorical_crossentropy is the matching loss - confirm.
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='rmsprop')
#NOTE(review): builds for 32x32x3 inputs - confirm this matches the data fed to fit().
model.build((None, 32, 32, 3))

model.summary()

import requests
from bs4 import BeautifulSoup
#Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

url = "https://www.kaggle.com/c/outbrain-click-prediction/data"
#Make a GET request to fetch the raw HTML content. The timeout keeps the cell
#from hanging, and HTTP errors are raised instead of being parsed as data.
page = requests.get(url, timeout=30)
page.raise_for_status()
web_content = page.content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove newlines and surrounding whitespace from the text of every element.
extract_contents = lambda row: [x.text.replace('\n', '').strip() for x in row]
#Find all table rows; matching rows are collected into stats further down.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  #Notice that the data that we require is now a list of length 5.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  new_cols = []
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of ‘string’ datatype so convert them into ‘int' datatype.
    kaggle_data[each_new_col] = stats_data[each_new_col].map(int)
    X, y = stats_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=11)
    
    # train_test_split on the previous line binds `X_train` (capital X); the
    # lower-case `x_train` used originally is not defined by the preceding code.
    history = model.fit(X_train, y_train,
                    batch_size=64,
                    epochs=1)

    model.summary()
    input_shape = (2, 3, 4)
    x1 = tf.random.normal(input_shape)
    x2 = tf.random.normal(input_shape)
    y = tf.keras.layers.Add()([x1, x2])
    print(y.shape)

    tf.keras.layers.LSTM(3, activation='tanh', recurrent_activation='sigmoid',
        use_bias=True, kernel_initializer='glorot_uniform',
        recurrent_initializer='orthogonal',
        bias_initializer='zeros', unit_forget_bias=True, dropout=0.0, recurrent_dropout=0.0,
        return_sequences=False, return_state=False, go_backwards=False, stateful=False,
        time_major=False, unroll=False)

    #Define a ConvLayer.
    class ConvLayer(Layer) :
        """Bias-free 2D convolution with "same" padding followed by a GeneralReLU.

        Args:
            nf: Number of convolution filters (output channels).
            ks: Kernel size passed to ``Conv2D``.
            s: Stride passed to ``Conv2D`` (an int or a pair of ints).
            **kwargs: Forwarded to the base ``Layer`` constructor.
        """
        def __init__(self, nf, ks=3, s=2, **kwargs):
            # Initialise the base Layer before assigning sub-layers so that
            # Keras attribute tracking is set up when they are attached.
            super(ConvLayer, self).__init__(**kwargs)
            self.nf = nf
            # Kept so compute_output_shape can honour the configured stride.
            self.stride = s
            self.grelu = GeneralReLU(leak=0.01)
            self.conv = (Conv2D(filters     = nf,
                                kernel_size = ks,
                                strides     = s,
                                padding     = "same",
                                use_bias    = False,
                                activation  = "linear"))

        def rsub(self): return -self.grelu.sub
        def set_sub(self, v): self.grelu.sub = -v
        # Keras layers expose their variables as `weights` (plural); the list is
        # only populated once the convolution has been built.
        def conv_weights(self): return self.conv.weights[0]

        def build(self, input_shape):
            #No weight to train.
            super(ConvLayer, self).build(input_shape)  # Be sure to call this at the end

        def compute_output_shape(self, input_shape):
            """Return (batch, out_h, out_w, nf) for an input indexed as (batch, h, w, ...)."""
            if isinstance(self.stride, int):
                stride_h = stride_w = self.stride
            else:
                stride_h, stride_w = self.stride

            def _same_padding_size(size, step):
                # "same" padding gives ceil(size / step); unknown sizes stay unknown.
                return None if size is None else -(-size // step)

            output_shape = (input_shape[0],
                            _same_padding_size(input_shape[1], stride_h),
                            _same_padding_size(input_shape[2], stride_w),
                            self.nf)
            return output_shape

        def call(self, x):
            """Apply the convolution, then the GeneralReLU activation."""
            return self.grelu(self.conv(x))

        def __repr__(self):
            return f'ConvLayer(nf={self.nf}, activation={self.grelu})'

    opt.evaluate(params, df)

Image Prediction with AutoGluon

In [None]:
#Image Prediction with AutoGluon
#Import AutoGluon.
%matplotlib inline
import autogluon.core as ag
from autogluon.vision import ImageDataset
import pandas as pd
#Celeb faces (celebA) dataset
import requests
from bs4 import BeautifulSoup
#Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
url = "https://www.kaggle.com/jessicali9530/celeba-dataset"
#Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove any newlines and extra spaces from left and right.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
#Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  #Notice that the data that we require is now a list of length 5.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  new_cols = []
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of ‘string’ datatype so convert them into ‘int' datatype.
    olympics2021_data[each_new_col] = stats_data[each_new_col].map(int)
    X, y = stats_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1221)
    csv_file_list=["list_attr_celeba.csv", "list_bbox_celeba.csv", "list_eval_partition.csv", "list_landmarks_align_celeba.csv"]
    for each_csv_file in csv_file_list:
      csv_file = ag.utils.download(each_csv_file)
      df = pd.read_csv(csv_file)
      df.head()
      df = ImageDataset.from_csv(csv_file)
      df.head()
    train_data, _, test_data = ImageDataset.from_folders("img_align_celeba.zip", train='train', test='test')
    print('train #', len(train_data), 'test #', len(test_data))
    train_data.head()
    #Load the splits with from_folder.
    root = os.path.join(os.path.dirname(train_data.iloc[0]['image']), '..')
    all_data = ImageDataset.from_folder(root)
    all_data.head()
    #Split the dataset.
    train, val, test = all_data.random_split(val_size=0.1, test_size=0.1)
    print('train #:', len(train), 'test #:', len(test))
    #Convert a list of images to dataset.
    celeba = ag.utils.download("img_align_celeba.zip")
    celeba = ag.utils.unzip(celeba)
    image_list = [x for x in os.listdir(os.path.join(pets, 'images')) if x.endswith('jpg')]
    new_data = ImageDataset.from_name_func(image_list, label_fn, root=os.path.join(os.getcwd(), celeba, 'images'))
    new_data
    #Visualize the images.
    new_data.show_images()
    #Image prediction
    import autogluon.core as ag
    from autogluon.vision import ImagePredictor, ImageDataset
    train_dataset, _, test_dataset = ImageDataset.from_folders("img_align_celeba.zip")
    print(train_dataset)
    #Fit a classifier.
    predictor = ImagePredictor()
    #Since the original dataset does not provide validation split, the `fit` function splits it randomly with 90/10 ratio.
    predictor.fit(train_dataset, hyperparameters={'epochs': 2})
    #The best Top-1 accuracy achieved on the validation set is:
    fit_result = predictor.fit_summary()
    print('Top-1 train acc: %.3f, val acc: %.3f' %(fit_result['train_acc'], fit_result['valid_acc']))
    #Predict on a new image.
    image_path = test_dataset.iloc[0]['image']
    result = predictor.predict(image_path)
    print(result)
    bulk_result = predictor.predict(test_dataset)
    print(bulk_result)
    #Generate image features with a classifier.
    image_path = test_dataset.iloc[0]['image']
    feature = predictor.predict_feature(image_path)
    print(feature)
    #Validate and test top-1 accuracy.
    test_acc = predictor.evaluate(test_dataset)
    print('Top-1 test acc: %.3f' % test_acc['top1'])
    #Save and load the classifiers.
    filename = 'predictor.ag'
    predictor.save(filename)
    predictor_loaded = ImagePredictor.load(filename)
    #Use predictor_loaded as usual.
    result = predictor_loaded.predict(image_path)
    print(result)
    #Use AutoGluon to produce an ImagePredictor to classify images.
    import autogluon.core as ag
    from autogluon.vision import ImagePredictor, ImageDataset
    train_data, _, test_data = ImageDataset.from_folders("img_align_celeba.zip")
    model = ag.Categorical('resnet18_v1b', 'mobilenetv3_small')
    model_list = ImagePredictor.list_models()
    #Specify the training hyper-parameters.
    batch_size = 8
    lr = ag.Categorical(1e-2, 1e-3)
    #Bayesian Optimization
    hyperparameters={'model': model, 'batch_size': batch_size, 'lr': lr, 'epochs': 2}
    predictor = ImagePredictor()
    predictor.fit(train_data, time_limit=60*10, hyperparameters=hyperparameters,
                  hyperparameter_tune_kwargs={'searcher': 'bayesopt', 'num_trials': 2})
    print('Top-1 val acc: %.3f' % predictor.fit_summary()['valid_acc'])
    #Load the test dataset and evaluate.
    results = predictor.evaluate(test_data)
    print('Test acc on hold-out data:', results)

In [None]:
#Install.
!pip install torch
#Upgrade pytorch.
!pip install --upgrade torch torchvision
#Install H2O.
!pip install h2o
#Install AutoKeras.
!pip install autokeras
#Upgrade TensorFlow
!pip install --ignore-installed --upgrade tensorflow
#Install PyTorch.
!pip install pytorch

In [None]:
import pandas as pd 
#AutoGluon
subsample_size = 2000  
feature_columns = ['Product_Description', 'Product_Type']
label = 'Sentiment'

# Parse the archive once and derive the three frames from it (the original
# parsed the same file three times and ignored `subsample_size`).
# NOTE(review): dev and test are the same rows the training sample is drawn
# from, so the reported scores are not hold-out scores -- confirm whether
# separate files were intended.
full_df = pd.read_csv('Participants_Data.zip', index_col=0)
# Never request more rows than the file holds; sample() raises otherwise.
train_df = full_df.sample(min(subsample_size, len(full_df)), random_state=123)
dev_df = full_df
test_df = full_df

train_df = train_df[feature_columns + [label]]
dev_df = dev_df[feature_columns + [label]]
test_df = test_df[feature_columns]
print('Number of training samples:', len(train_df))
print('Number of dev samples:', len(dev_df))
print('Number of test samples:', len(test_df))
train_df.head()
dev_df.head()
test_df.head()
from autogluon.tabular import TabularPredictor
predictor = TabularPredictor(label=label, path='ag_tabular_product_sentiment_multimodal')
predictor.fit(train_df, hyperparameters='multimodal')
predictor.leaderboard(dev_df)
#Improve predictive performance by using stack ensembling.
# A TabularPredictor that has already been fitted rejects a second fit() call,
# so the stacked run gets its own predictor and its own output directory.
stacked_predictor = TabularPredictor(label=label, path='ag_tabular_product_sentiment_multimodal_stacked')
stacked_predictor.fit(train_df, hyperparameters='multimodal', num_bag_folds=5, num_stack_levels=1)


In [None]:
#NFL Big Data Bowl 2022 kaggle dataset
import numpy as np 
import pandas as pd
from autokeras import StructuredDataClassifier
from sklearn.model_selection import train_test_split
import requests
from bs4 import BeautifulSoup
import tensorflow as tf
import autokeras as ak

#Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
url = "https://www.kaggle.com/c/nfl-big-data-bowl-2022/data"
#Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove any newlines and extra spaces from left and right.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
#Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  #Notice that the data that we require is now a list of length 5.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  new_cols = []
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of ‘string’ datatype so convert them into ‘int' datatype.
    nfl_2022_data[each_new_col] = stats_data[each_new_col].map(int)
    X, y = stats_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1221)
    csv_file_list=["PFFScoutingData.csv", "games.csv", "players.csv", "plays.csv", "tracking2018.csv", "tracking2019.csv", "tracking2020.csv"]
    for each_csv_file in csv_file_list:
      csv_file = ag.utils.download(each_csv_file)
      df = pd.read_csv(csv_file)
      df.head()
      df = ImageDataset.from_csv(csv_file)
      df.head()

#StructuredDataClassifier
# The statements below spell out the constructor and method signatures with
# their default values; they are not wired to any dataset in this cell.
# NOTE(review): this cell imports the package as `ak` and the class by name, so
# the bare `autokeras` name and `kwargs` do not appear to be bound here -- confirm
# before running.
autokeras.StructuredDataClassifier(
    column_names=None,
    column_types=None,
    num_classes=None,
    multi_label=False,
    loss=None,
    metrics=None,
    project_name="structured_data_classifier",
    max_trials=100,
    directory=None,
    objective="val_accuracy",
    tuner=None,
    overwrite=False,
    seed=None,
    max_model_size=None,
    **kwargs)

#Fit.
# NOTE(review): fit/predict/evaluate/export_model are invoked on the class
# itself rather than on a constructed classifier, and `x` is not assigned in
# this cell -- presumably these should be called on an instance with real data.
StructuredDataClassifier.fit(
    x=None, y=None, epochs=None, callbacks=None, validation_split=0.2, validation_data=None, **kwargs)
#Predict.
StructuredDataClassifier.predict(x, **kwargs)
#Evaluate.
StructuredDataClassifier.evaluate(x, y=None, **kwargs)
#Export the model using export_model.
StructuredDataClassifier.export_model()

In [None]:
#MNIST
from mlbox.optimisation import Optimiser, Regressor
import pandas as pd
mnist_train_data=pd.read_csv("/content/sample_data/mnist_train_small.csv")
mnist_test_data=pd.read_csv("/content/sample_data/mnist_test.csv")
# NOTE(review): pd.merge without keys inner-joins on every shared column name;
# if the goal is to stack the two splits, pd.concat is presumably what is wanted.
mnist_data = pd.merge(mnist_train_data, mnist_test_data)
#Load the data.
dataset = mnist_data
#Evaluate the pipeline.
opt = Optimiser()
params = {"ne__numerical_strategy" : 0, "ce__strategy" : "label_encoding", "fs__threshold" : 0.1, "stck__base_estimators" : [Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")], "est__strategy" : "Linear"}
# Features and target are both sliced from the training frame so that every
# label belongs to the row it is paired with (the original took the target
# column from the test frame).
# NOTE(review): this treats the last column as the label -- confirm the column
# layout of these CSV files.
df = {"train" : pd.DataFrame(mnist_train_data.iloc[:,:-1]), "target" : pd.Series(mnist_train_data.iloc[:,-1])}

#Build a keras model.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
model = keras.Sequential()
#ReLU: Rectified Linear Unit.
#Adds a densely-connected layer with 64 units to the model.
model.add(keras.layers.Dense(64, activation='relu'))
#Add another.
model.add(keras.layers.Dense(64, activation='relu'))
#Add a softmax layer with 10 output units.
model.add(keras.layers.Dense(10, activation='softmax'))
#Define a ConvModel.
class ConvModel(tf.keras.Model):
    """Stack of ConvLayer blocks followed by average pooling and a softmax head.

    Args:
        nfs: Sequence of filter counts, one per ConvLayer; must be non-empty.
        input_shape: Forwarded to the first ConvLayer as its ``input_shape``.
        output_shape: Number of units of the final softmax ``Dense`` layer.
        use_bn: Stored on the instance; not consulted anywhere in this class.
        use_dp: Stored on the instance; not consulted anywhere in this class.
    """
    def __init__(self, nfs, input_shape, output_shape, use_bn=False, use_dp=False):
        super(ConvModel, self).__init__(name='mlp')
        self.use_bn = use_bn
        self.use_dp = use_dp
        # The original read an undefined `num_classes` name here, which raised
        # NameError on construction; the class count is the softmax head size.
        self.num_classes = output_shape

        #Backbone layers
        self.convs = [ConvLayer(nfs[0], s=1, input_shape=input_shape)]
        self.convs += [ConvLayer(nf) for nf in nfs[1:]]
        #Classification layers.
        # Layer classes are referenced through `tf.keras.layers` so the class
        # does not depend on bare `AveragePooling2D` / `Dense` names being imported.
        # NOTE(review): AveragePooling2D keeps the spatial axes, so the Dense
        # head is applied per position -- confirm whether GlobalAveragePooling2D
        # was intended.
        self.convs.append(tf.keras.layers.AveragePooling2D())
        self.convs.append(tf.keras.layers.Dense(output_shape, activation='softmax'))

    def call(self, inputs):
        """Feed ``inputs`` through every layer in ``self.convs`` in order."""
        for layer in self.convs: inputs = layer(inputs)
        return inputs
#Compile the model.
# Keras loss identifiers are spelled with underscores; the original
# 'categorical crossentropy' (with a space) is not a recognised loss name.
# NOTE(review): this loss expects one-hot encoded targets -- confirm the labels
# passed to fit() are encoded that way.
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='rmsprop')
model.build((None, 32, 32, 3))
model.summary()

#Olympics 2021 dataset
import requests
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
#Dataset's URL:
url = "https://www.kaggle.com/arjunprasadsarkhel/2021-olympics-in-tokyo"
#Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove any newlines and extra spaces from left and right.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
#Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  #Notice that the data that we require is now a list of length 5.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  new_cols = []
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of ‘string’ datatype so convert them into ‘int' datatype.
    olympics2021_data[each_new_col] = stats_data[each_new_col].map(int)
    X, y = stats_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=11)
    
    # train_test_split on the previous line binds `X_train` (capital X); the
    # lower-case `x_train` used originally is not defined by the preceding code.
    history = model.fit(X_train, y_train,
                    batch_size=64,
                    epochs=1)

    model.summary()
    input_shape = (2, 3, 4)
    x1 = tf.random.normal(input_shape)
    x2 = tf.random.normal(input_shape)
    y = tf.keras.layers.Add()([x1, x2])
    print(y.shape)

    tf.keras.layers.LSTM(3, activation='tanh', recurrent_activation='sigmoid',
        use_bias=True, kernel_initializer='glorot_uniform',
        recurrent_initializer='orthogonal',
        bias_initializer='zeros', unit_forget_bias=True, dropout=0.0, recurrent_dropout=0.0,
        return_sequences=False, return_state=False, go_backwards=False, stateful=False,
        time_major=False, unroll=False)

    #Define a ConvLayer.
    class ConvLayer(Layer) :
        """Bias-free 2D convolution with "same" padding followed by a GeneralReLU.

        Args:
            nf: Number of convolution filters (output channels).
            ks: Kernel size passed to ``Conv2D``.
            s: Stride passed to ``Conv2D`` (an int or a pair of ints).
            **kwargs: Forwarded to the base ``Layer`` constructor.
        """
        def __init__(self, nf, ks=3, s=2, **kwargs):
            # Initialise the base Layer before assigning sub-layers so that
            # Keras attribute tracking is set up when they are attached.
            super(ConvLayer, self).__init__(**kwargs)
            self.nf = nf
            # Kept so compute_output_shape can honour the configured stride.
            self.stride = s
            self.grelu = GeneralReLU(leak=0.01)
            self.conv = (Conv2D(filters     = nf,
                                kernel_size = ks,
                                strides     = s,
                                padding     = "same",
                                use_bias    = False,
                                activation  = "linear"))

        def rsub(self): return -self.grelu.sub
        def set_sub(self, v): self.grelu.sub = -v
        # Keras layers expose their variables as `weights` (plural); the list is
        # only populated once the convolution has been built.
        def conv_weights(self): return self.conv.weights[0]

        def build(self, input_shape):
            #No weight to train.
            super(ConvLayer, self).build(input_shape)  # Be sure to call this at the end

        def compute_output_shape(self, input_shape):
            """Return (batch, out_h, out_w, nf) for an input indexed as (batch, h, w, ...)."""
            if isinstance(self.stride, int):
                stride_h = stride_w = self.stride
            else:
                stride_h, stride_w = self.stride

            def _same_padding_size(size, step):
                # "same" padding gives ceil(size / step); unknown sizes stay unknown.
                return None if size is None else -(-size // step)

            output_shape = (input_shape[0],
                            _same_padding_size(input_shape[1], stride_h),
                            _same_padding_size(input_shape[2], stride_w),
                            self.nf)
            return output_shape

        def call(self, x):
            """Apply the convolution, then the GeneralReLU activation."""
            return self.grelu(self.conv(x))

        def __repr__(self):
            return f'ConvLayer(nf={self.nf}, activation={self.grelu})'

    opt.evaluate(params, df)

In [None]:
import pandas as pd
dataset_dict={"New York City Airport Activity": "https://www.kaggle.com/sveneschlbeck/new-york-city-airport-activity?select=nyc-flights.csv"}
trained_model = automl(**dataset_dict)
print(trained_model)

In [None]:
import requests
from bs4 import BeautifulSoup
#Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
url = "https://www.kaggle.com/andrewmvd/heart-failure-clinical-data"
#Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove any newlines and extra spaces from left and right.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
#Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  #Notice that the data that we require is now a list of length 5.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  new_cols = []
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of ‘string’ datatype so convert them into ‘int' datatype.
    olympics2021_data[each_new_col] = stats_data[each_new_col].map(int)
    X, y = stats_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1001)
    
    # train_test_split on the previous line binds `X_train` (capital X); the
    # lower-case `x_train` used originally is not defined by the preceding code.
    history = model.fit(X_train, y_train,
                    batch_size=64,
                    epochs=1000)

    model.summary()
    input_shape = (2, 3, 4)
    x1 = tf.random.normal(input_shape)
    x2 = tf.random.normal(input_shape)
    y = tf.keras.layers.Add()([x1, x2])
    print(y.shape)

    tf.keras.layers.LSTM(3, activation='tanh', recurrent_activation='sigmoid',
        use_bias=True, kernel_initializer='glorot_uniform',
        recurrent_initializer='orthogonal',
        bias_initializer='zeros', unit_forget_bias=True, dropout=0.0, recurrent_dropout=0.0,
        return_sequences=False, return_state=False, go_backwards=False, stateful=False,
        time_major=False, unroll=False)

    #Define a ConvLayer.
    class ConvLayer(Layer) :
        """Bias-free 2D convolution with "same" padding followed by a GeneralReLU.

        Args:
            nf: Number of convolution filters (output channels).
            ks: Kernel size passed to ``Conv2D``.
            s: Stride passed to ``Conv2D`` (an int or a pair of ints).
            **kwargs: Forwarded to the base ``Layer`` constructor.
        """
        def __init__(self, nf, ks=3, s=2, **kwargs):
            # Initialise the base Layer before assigning sub-layers so that
            # Keras attribute tracking is set up when they are attached.
            super(ConvLayer, self).__init__(**kwargs)
            self.nf = nf
            # Kept so compute_output_shape can honour the configured stride.
            self.stride = s
            self.grelu = GeneralReLU(leak=0.01)
            self.conv = (Conv2D(filters     = nf,
                                kernel_size = ks,
                                strides     = s,
                                padding     = "same",
                                use_bias    = False,
                                activation  = "linear"))

        def rsub(self): return -self.grelu.sub
        def set_sub(self, v): self.grelu.sub = -v
        # Keras layers expose their variables as `weights` (plural); the list is
        # only populated once the convolution has been built.
        def conv_weights(self): return self.conv.weights[0]

        def build(self, input_shape):
            #No weight to train.
            super(ConvLayer, self).build(input_shape)  # Be sure to call this at the end

        def compute_output_shape(self, input_shape):
            """Return (batch, out_h, out_w, nf) for an input indexed as (batch, h, w, ...)."""
            if isinstance(self.stride, int):
                stride_h = stride_w = self.stride
            else:
                stride_h, stride_w = self.stride

            def _same_padding_size(size, step):
                # "same" padding gives ceil(size / step); unknown sizes stay unknown.
                return None if size is None else -(-size // step)

            output_shape = (input_shape[0],
                            _same_padding_size(input_shape[1], stride_h),
                            _same_padding_size(input_shape[2], stride_w),
                            self.nf)
            return output_shape

        def call(self, x):
            """Apply the convolution, then the GeneralReLU activation."""
            return self.grelu(self.conv(x))

        def __repr__(self):
            return f'ConvLayer(nf={self.nf}, activation={self.grelu})'

    opt.evaluate(params, df)

In [None]:
#Explore California housing dataset using MLBox.
from mlbox.optimisation import Optimiser, Regressor
import pandas as pd
cal_house_train_data=pd.read_csv("/content/sample_data/california_housing_train.csv")
cal_house_test_data=pd.read_csv("/content/sample_data/california_housing_test.csv")
# Stack the two splits row-wise. pd.merge without keys performs an inner join
# on every shared column, which keeps only rows that occur in both files.
cal_house_data = pd.concat([cal_house_train_data, cal_house_test_data], ignore_index=True)
#Load the data.
dataset = cal_house_data
#Evaluate the pipeline.
opt = Optimiser()
params = {"ne__numerical_strategy" : 0, "ce__strategy" : "label_encoding", "fs__threshold" : 0.1, "stck__base_estimators" : [Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")], "est__strategy" : "Linear"}
# Features (all but the last column) and target (the last column) are both
# sliced from the training split so every label belongs to its own row; the
# original paired training features with the test split's last column.
df = {"train" : pd.DataFrame(cal_house_train_data.iloc[:,:-1]), "target" : pd.Series(cal_house_train_data.iloc[:,-1])}
opt.evaluate(params, df)

In [None]:
#Explore Amazon dataset using MLBox.
from mlbox.optimisation import Optimiser, Regressor
import pandas as pd
# Raw strings keep the Windows backslashes literal: in a normal string "\U"
# starts a unicode escape (a SyntaxError here) and "\a" / "\t" silently turn
# into control characters.
# NOTE(review): the files are .arff but are parsed with read_csv -- confirm the
# ARFF header lines are handled as intended.
amazon_train_data=pd.read_csv(r"C:\Users\Administrator\OneDrive - Bitwise Solutions Private Limited\Documents\AutoML\amazon 2\train.arff")
amazon_test_data=pd.read_csv(r"C:\Users\Administrator\OneDrive - Bitwise Solutions Private Limited\Documents\AutoML\amazon 2\test.arff")
# NOTE(review): pd.merge without keys inner-joins on every shared column name;
# if the goal is to stack the two splits, pd.concat is presumably what is wanted.
amazon_data = pd.merge(amazon_train_data, amazon_test_data)
#Load the data.
dataset = amazon_data
#Evaluate the pipeline.
opt = Optimiser()
params = {"ne__numerical_strategy" : 0, "ce__strategy" : "label_encoding", "fs__threshold" : 0.1, "stck__base_estimators" : [Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")], "est__strategy" : "Linear"}
# Features and target are both sliced from the training frame so that every
# label belongs to the row it is paired with (the original took the target
# column from the test frame).
df = {"train" : pd.DataFrame(amazon_train_data.iloc[:,:-1]), "target" : pd.Series(amazon_train_data.iloc[:,-1])}
opt.evaluate(params, df)

In [None]:
#Explore the "Covid-19 in India" dataset.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable
url = "https://www.kaggle.com/sudalairajkumar/covid19-in-india"
#Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove any newlines and extra spaces from left and right.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
#Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td')) 
#Notice that the data that we require is now a list of length 5.
    if len(stat) == 5:
        stats.append(stat)
#Now convert the data into a pandas dataframe for further processing.
new_cols = ["Sr.No", "States/UT","Confirmed","Recovered","Deceased"]
state_data = pd.DataFrame(data = stats, columns = new_cols)
state_data.head()
#Scraped data columns are of ‘string’ datatype.
#Convert them into ‘int' datatype.
state_data["Confirmed"] = state_data["Confirmed"].map(int)
state_data["Recovered"] = state_data["Recovered"].map(int)
state_data["Deceased"] = state_data["Deceased"].map(int)
#Present the data using Pretty table.
table = PrettyTable()
table.field_names = (new_cols)
for i in stats:
    table.add_row(i)
table.add_row(['','Total', 
               sum(state_data["Confirmed"]), 
               sum(state_data["Recovered"]),
               sum(state_data["Deceased"])])
print(table)
#Utilize Barplot to show total confirmed cases Statewise.
sns.set_style("ticks")
plt.figure(figsize=(15,10))
plt.barh(state_data["States/UT"], state_data["Confirmed"].map(int),
         align = 'center', color = 'lightblue', edgecolor = 'blue')
plt.xlabel('No. of Confirmed cases', fontsize = 18)
plt.ylabel('States/UT', fontsize = 18)
plt.gca().invert_yaxis() # This is to maintain the order in which the states appear
plt.xticks(fontsize = 14) 
plt.yticks(fontsize = 14)
plt.title('Total Confirmed Cases Statewise', fontsize = 20)
for index, value in enumerate(state_data["Confirmed"]):
    plt.text(value, index, str(value), fontsize = 12, verticalalignment = 'center')
plt.show()  
#Utilize donut chart representing nationwide total confirmed, cured and deceased cases.
group_size = [sum(state_data['Confirmed']), 
              sum(state_data['Recovered']), 
              sum(state_data['Deceased'])]
group_labels = ['Confirmed\n' + str(sum(state_data['Confirmed'])), 
                'Recovered\n' + str(sum(state_data['Recovered'])), 
                'Deceased\n'  + str(sum(state_data['Deceased']))]
custom_colors = ['skyblue','yellowgreen','tomato']
plt.figure(figsize = (5,5))
plt.pie(group_size, labels = group_labels, colors = custom_colors)
central_circle = plt.Circle((0,0), 0.5, color = 'white')
fig = plt.gcf()
fig.gca().add_artist(central_circle)
plt.rc('font', size = 12) 
plt.title('Nationwide total Confirmed, Recovered and Deceased Cases', fontsize = 16)
plt.show()

import fiona
#Read the shape file of map of India in GeoDataFrame.
map_data = gpd.read_file("Indian_States.shp")
map_data.rename(columns = {"st_nm":"States/UT"}, inplace = True)
map_data.head()
# "&" is meant literally, so regex matching is switched off explicitly.
map_data["States/UT"] = map_data["States/UT"].str.replace("&","and", regex = False)
#Align the state names of the shape file with the names used in state_data.
# The corrected column is assigned back: calling replace(..., inplace=True) on
# a column selection acts on an intermediate object and is not guaranteed to
# update map_data itself.
state_name_fixes = {
    "Arunanchal Pradesh": "Arunachal Pradesh",
    "Telangana": "Telengana",
    "NCT of Delhi": "Delhi",
    "Andaman and Nicobar Island": "Andaman and Nicobar Islands",
}
map_data["States/UT"] = map_data["States/UT"].replace(state_name_fixes)
#Left join keeps every map shape, including those without scraped figures.
merged_data = pd.merge(map_data, state_data, 
                       how = "left", on = "States/UT")
merged_data.fillna(0, inplace = True)
merged_data.drop("Sr.No", axis = 1, inplace = True)
merged_data.head()

#MLbox:
# `params` below instantiates Regressor, so it is imported together with
# Optimiser (the original import line brought in Optimiser only).
from mlbox.optimisation import Optimiser, Regressor
#Evaluate the pipeline.
opt = Optimiser()
params = {"ne__numerical_strategy" : 0, "ce__strategy" : "label_encoding", "fs__threshold" : 0.1, "stck__base_estimators" : [Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")], "est__strategy" : "Linear"}
# NOTE(review): `dataset` is not assigned in this cell and is read through
# `.data` / `.target` attributes -- confirm which object is expected here.
df = {"train" : pd.DataFrame(dataset.data), 
      "target" : pd.Series(dataset.target)}
opt.evaluate(params, df)

In [None]:
#Fashion MNIST dataset
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
#Explore the data.
train_images.shape
#Obtain the length of the train labels.
len(train_labels)

train_labels
#Get the shape of test images.
test_images.shape
#Obtain the length of the test labels.
len(test_labels)

#Preprocess the data.
plt.figure()
plt.imshow(train_images[0])
plt.colorbar()
plt.grid(False)
plt.show()

train_images = train_images / 255.0

test_images = test_images / 255.0

plt.figure(figsize=(10,10))
for i in range(25):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[i], cmap=plt.cm.binary)
    plt.xlabel(class_names[train_labels[i]])
plt.show()

#Build the model. Set up the layers.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10)
])

#Compile the model.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

#Train the model. Feed the model.
model.fit(train_images, train_labels, epochs=1000)

#Evaluate the accuracy of the model.
test_loss, test_acc = model.evaluate(test_images,  test_labels, verbose=2)
print('\nTest accuracy:', test_acc)

#Make the predictions.
probability_model = tf.keras.Sequential([model, 
                                         tf.keras.layers.Softmax()])
#Predict.
predictions = probability_model.predict(test_images)
predictions[0]
np.argmax(predictions[0])
test_labels[0]

#Look at the full set of 10 class predictions.
def plot_image(i, predictions_array, true_label, img):
  """Show image ``i`` on the current axes, captioned with the prediction.

  The caption reads "<predicted class> <max score as a percentage>% (<true class>)"
  and is blue when the arg-max of ``predictions_array`` equals the true label,
  red otherwise. ``true_label`` and ``img`` are indexed with ``i``;
  ``predictions_array`` is used as given.
  """
  actual = true_label[i]
  picture = img[i]

  # Bare axes: no grid and no tick marks around the picture.
  plt.grid(False)
  plt.xticks([])
  plt.yticks([])
  plt.imshow(picture, cmap=plt.cm.binary)

  guess = np.argmax(predictions_array)
  confidence_pct = 100*np.max(predictions_array)
  caption = "{} {:2.0f}% ({})".format(class_names[guess],
                                      confidence_pct,
                                      class_names[actual])
  plt.xlabel(caption, color='blue' if guess == actual else 'red')

def plot_value_array(i, predictions_array, true_label):
  """Draw the ten scores in ``predictions_array`` as a bar chart.

  All bars start grey; the arg-max bar is painted red and then the bar of the
  true label (``true_label[i]``) is painted blue, so a correct prediction ends
  up blue.
  """
  actual = true_label[i]
  plt.grid(False)
  plt.xticks(range(10))
  plt.yticks([])
  bars = plt.bar(range(10), predictions_array, color="#777777")
  plt.ylim([0, 1])
  guess = np.argmax(predictions_array)

  # Order matters: blue is applied last and wins when both indices coincide.
  for bar_index, shade in ((guess, 'red'), (actual, 'blue')):
    bars[bar_index].set_color(shade)

#Verify the predictions.
#For test samples 0 and 12: the image on the left, its score bars on the right.
for i in (0, 12):
  plt.figure(figsize=(6,3))
  plt.subplot(1,2,1)
  plot_image(i, predictions[i], test_labels, test_images)
  plt.subplot(1,2,2)
  plot_value_array(i, predictions[i],  test_labels)
  plt.show()

#Plot the first X test images, their predicted labels, and the true labels.
#Color correct predictions in blue and incorrect predictions in red.
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
  plt.subplot(num_rows, 2*num_cols, 2*i+1)
  plot_image(i, predictions[i], test_labels, test_images)
  plt.subplot(num_rows, 2*num_cols, 2*i+2)
  plot_value_array(i, predictions[i], test_labels)
plt.tight_layout()
plt.show()

#Use the trained model.
#Grab an image from the test dataset.
img = test_images[1]
print(img.shape)
#Add the image to a batch where it's the only member.
img = (np.expand_dims(img,0))
print(img.shape)
#Now predict the correct label for this image

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable

#Specify URL for Web Scraping.
url = "https://www.mygov.in/corona-data/covid19-statewise-status/"
#Make a GET request to fetch the raw HTML content.
#A timeout keeps the cell from hanging on an unresponsive server, and an HTTP
#error status is raised here instead of being parsed as if it were the data page.
response = requests.get(url, timeout=30)
response.raise_for_status()
web_content = response.content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove any newlines and extra spaces from left and right.
#(strip() performs the left/right trimming this comment describes; the original
#only removed the newlines.)
extract_contents = lambda row: [x.text.replace('\n', '').strip() for x in row]
#Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td')) 
    #Notice that the data that we require is now a list of length 5.
    if len(stat) == 5:
        stats.append(stat)
#Now convert the data into a pandas dataframe for further processing.
new_cols = ["Sr.No", "States/UT","Confirmed","Recovered","Deceased"]
state_data = pd.DataFrame(data = stats, columns = new_cols)
state_data.head()
#Scraped data columns are of 'string' datatype.
#Convert them into 'int' datatype.
#NOTE(review): int() rejects values containing thousands separators -- confirm
#the format of the scraped figures.
state_data["Confirmed"] = state_data["Confirmed"].map(int)
state_data["Recovered"] = state_data["Recovered"].map(int)
state_data["Deceased"] = state_data["Deceased"].map(int)

#Present the data using Pretty table.
table = PrettyTable()
table.field_names = (new_cols)
for i in stats:
    table.add_row(i)
table.add_row(['','Total', 
               sum(state_data["Confirmed"]), 
               sum(state_data["Recovered"]),
               sum(state_data["Deceased"])])
print(table)

In [None]:
from mlbox.preprocessing import Reader
from mlbox.preprocessing import Drift_thresholder
from mlbox.optimisation import Optimiser
from mlbox.prediction import Predictor

#End-to-end MLBox run: read the data, drop drifting features, tune the
#pipeline hyper-parameters and write the predictions.

#Paths to the train set and the test set.
#Reader.train_test_split() takes a list of local file paths; the Kaggle page
#URL below is kept only as a pointer to where the data comes from.
url = "https://www.kaggle.com/shivamb/netflix-shows"
#NOTE(review): assumes the dataset CSV was downloaded next to the notebook
#under this name - adjust the path(s) to your local copy.
paths = ["netflix_titles.csv"]
#Name of the feature to predict.
#This column should only be present in the train set.
target_name = "rating"

#Reading and cleaning all files
#Declare a reader for csv files
rd = Reader(sep=',')
#Return a dictionary containing three entries
# dict["train"] contains training samples without target columns
# dict["test"] contains testing elements without target columns
# dict["target"] contains target columns for training samples.
data = rd.train_test_split(paths, target_name)

dft = Drift_thresholder()
data = dft.fit_transform(data)

#Tuning
# Declare an optimiser. Scoring possibilities for classification lie in :
# {"accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"}
opt = Optimiser(scoring='accuracy', n_folds=3)
# Passing None as the parameter set scores the default pipeline first.
opt.evaluate(None, data)

# Space of hyperparameters
# The keys must respect the following syntax : "enc__param".
#   "enc" = "ne" for na encoder
#   "enc" = "ce" for categorical encoder
#   "enc" = "fs" for feature selector [OPTIONAL]
#   "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
#   "enc" = "est" for the final estimator
#   "param" : a correct associated parameter for each step.
#   Ex: "max_depth" for "enc"="est", ...
# The values must respect the syntax: {"search":strategy,"space":list}
#   "strategy" = "choice" or "uniform". Default = "choice"
#   list : a list of values to be tested if strategy="choice".
#   Else, list = [value_min, value_max].
# Available strategies for ne_numerical_strategy are either an integer, a float
#   or in {'mean', 'median', "most_frequent"}
# Available strategies for ce_strategy are:
#   {"label_encoding", "dummification", "random_projection", "entity_embedding"}
space = {'ne__numerical_strategy': {"search": "choice", "space": [0]},
         'ce__strategy': {"search": "choice",
                          "space": ["label_encoding",
                                    "random_projection",
                                    "entity_embedding"]},
         'fs__threshold': {"search": "uniform",
                           "space": [0.01, 0.3]},
         'est__max_depth': {"search": "choice",
                            "space": [3, 4, 5, 6, 7]}

         }

# Optimises hyper-parameters of the whole Pipeline with a given scoring
# function. Algorithm used to optimize : Tree Parzen Estimator.
#
# IMPORTANT : Try to avoid dependent parameters and to set one feature
# selection strategy and one estimator strategy at a time.
# The third argument (15) is the number of search iterations.
best = opt.optimise(space, data, 15)

# Make prediction and save the results in save folder.
prd = Predictor()
prd.fit_predict(best, data)

kaggle competitions download -c petfinder-adoption-prediction

$ brew install jenv

val scoreFn = new OpWorkflowRunnerLocal(workflow).scoreFunction(opParams)

Install GraphViz & PyDot

In [None]:
#Install Graphviz and pydot.
!apt-get -qq install -y graphviz && pip install pydot
import pydot

In [None]:
#Install Cartopy.
!pip install cartopy
import cartopy

**TPOT**

Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science

Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science. Proceedings of GECCO 2016, pages 485-492.

> Developed by Randal S. Olson and others at the University of Pennsylvania.



**Install TPOT:**

> 



In [None]:
!pip install tpot

**Classification using TPOT:**

Wine dataset

In [None]:
#TPOT
from tpot import TPOTClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
#Fit a TPOT classifier on the wine dataset, report the hold-out score and
#export the best pipeline as a Python script.
wine = load_wine()
#Perform a train test split.
#Seeded like the TPOT search so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, train_size=0.75, test_size=0.25, random_state=11)
#TPOT classifier
#  max_eval_time_mins: the original 1e-8 minutes timed out every candidate
#    pipeline before it could be scored; 0.5 matches the regressor cells.
#  memory: 'auto' (the original 'áuto' was a typo that TPOT would treat as a
#    cache directory name).
#  log_file: named after this dataset (was copy-pasted from the digits cell).
tpot = TPOTClassifier(
    generations=99, population_size=99, mutation_rate=0.7, crossover_rate=0.3,
    random_state=11, cv=5, subsample=0.98, verbosity=2, n_jobs=-2,
    max_eval_time_mins=0.5, config_dict='TPOT light', memory='auto',
    log_file='tpot_wine_logs')
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_wine_pipeline.py')

#TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is:
#(Defined for reference only - it is not passed to the classifier above.)
tpot_config = {
    'tpot.nn.PytorchLRClassifier': {
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.]
    }
}

Digits dataset

In [None]:
#TPOT
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
#Fit a TPOT classifier on the digits dataset, report the hold-out score and
#export the best pipeline as a Python script.
digits = load_digits()
#Perform a train test split.
#Seeded like the TPOT search so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, train_size=0.75, test_size=0.25,
    random_state=1110)
#TPOT classifier
#  max_eval_time_mins: the original 1e-8 minutes timed out every candidate
#    pipeline before it could be scored; 0.5 matches the regressor cells.
#  memory: 'auto' (the original 'áuto' was a typo that TPOT would treat as a
#    cache directory name).
tpot = TPOTClassifier(
    generations=99, population_size=99, mutation_rate=0.7, crossover_rate=0.3,
    random_state=1110, cv=5, subsample=0.98, verbosity=2, n_jobs=-2,
    max_eval_time_mins=0.5, config_dict='TPOT light', memory='auto',
    log_file='tpot_digits_logs')
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export("tpot_digits_pipeline.py")

#TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is:
#(Defined for reference only - it is not passed to the classifier above.)
tpot_config = {
    'tpot.nn.PytorchLRClassifier': {
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.]
    }
}

Diabetes dataset

In [None]:
#TPOT
#load_diabetes() has a continuous (regression) target, so it is fitted with
#TPOTRegressor; a classifier would treat every distinct value as its own class.
from tpot import TPOTRegressor
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
diabetes = load_diabetes()
#Perform a train test split.
#Seeded like the TPOT search so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, train_size=0.75, test_size=0.25,
    random_state=131)
#TPOT regressor
#  scoring='r2' matches the other regressor cells in this notebook.
#  max_eval_time_mins: the original 1e-8 minutes timed out every candidate
#    pipeline before it could be scored; 0.5 matches the regressor cells.
#  memory: 'auto' (the original 'áuto' was a typo that TPOT would treat as a
#    cache directory name).
tpot = TPOTRegressor(
    generations=99, population_size=99, mutation_rate=0.7, crossover_rate=0.3,
    random_state=131, cv=5, subsample=0.999999999, verbosity=2, n_jobs=-2,
    scoring='r2', max_eval_time_mins=0.5, config_dict='TPOT light',
    memory='auto', log_file='tpot_diabetes_logs')
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_diabetes_pipeline.py')

#TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is:
#(Defined for reference only - it is not passed to the estimator above.)
tpot_config = {
    'tpot.nn.PytorchLRClassifier': {
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.]
    }
}

Iris dataset

In [None]:
#TPOT
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
#Fit a TPOT classifier on the iris dataset, report the hold-out score and
#export the best pipeline as a Python script.
iris = load_iris()
#Perform a train test split
#Seeded like the TPOT search so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.75, test_size=0.25, random_state=131)
#TPOT classifier
#  max_eval_time_mins: the original 1e-6 minutes timed out every candidate
#    pipeline before it could be scored; 0.5 matches the regressor cells.
#  memory: 'auto' (the original 'áuto' was a typo that TPOT would treat as a
#    cache directory name).
tpot = TPOTClassifier(
    generations=99, population_size=99, mutation_rate=0.7, crossover_rate=0.3,
    random_state=131, cv=5, subsample=0.98, verbosity=2, n_jobs=-2,
    max_eval_time_mins=0.5, config_dict='TPOT light', memory='auto',
    log_file='tpot_iris_logs')
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_iris_pipeline.py')

#TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is:
#(Defined for reference only - it is not passed to the classifier above.)
tpot_config = {
    'tpot.nn.PytorchLRClassifier': {
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.]
    }
}



Latest India Covid-19 status dataset




In [None]:
#TPOT
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
#Import
import sklearn.model_selection
import sklearn.metrics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable

#Scrape a 5-column table into state_data, then try to fit a TPOT classifier
#on it and export the best pipeline.

#Specify the URL for web-scraping.
#NOTE(review): this is a Kaggle dataset landing page, unlike the mygov.in
#table used by the other scraping cells - confirm it really serves an HTML
#table whose rows have 5 <td> cells, otherwise stats stays empty.
url = "https://www.kaggle.com/sudalairajkumar/covid19-in-india"
#Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
#Remove newlines from each cell's text.
#(Surrounding spaces are NOT stripped here, despite the original comment.)
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
#Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td')) 
#Keep only the rows that have exactly 5 cells.
    if len(stat) == 5:
        stats.append(stat)
#Now convert the data into a pandas dataframe for further processing.
new_cols = ["Sr.No", "States/UT","Confirmed","Recovered","Deceased"]
state_data = pd.DataFrame(data = stats, columns = new_cols)
state_data.head()
#Scraped data columns are of 'string' datatype.
#Convert them into 'int' datatype.
state_data["Confirmed"] = state_data["Confirmed"].map(int)
state_data["Recovered"] = state_data["Recovered"].map(int)
state_data["Deceased"] = state_data["Deceased"].map(int)
India_covid_status = state_data
#Perform a train test split.
#NOTE(review): India_covid_status is the DataFrame built above; nothing in
#this cell gives it .data/.target attributes (the neighbouring cells get those
#from sklearn dataset loaders), so this line presumably fails. The feature
#columns and a class label still have to be chosen - confirm the intent.
X_train, X_test, y_train, y_test = train_test_split(India_covid_status.data, India_covid_status.target, train_size=0.75, test_size=0.25)
#TPOT classifier
#NOTE(review): memory='áuto' (accented) looks like a typo for 'auto', and
#max_eval_time_mins=0.00000001 gives each pipeline almost no time - confirm.
tpot=TPOTClassifier(generations=99, population_size=99, mutation_rate=0.7, crossover_rate=0.3, random_state=131, cv=5, subsample=0.98, verbosity=2, n_jobs=-2, max_eval_time_mins=0.00000001, config_dict='TPOT light', memory='áuto', log_file='tpot_India_covid_status_logs')
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_India_covid_status_pipeline.py')

#TPOT to only use a PyTorch-based logistic regression classifier as its main estimator is:
#(This dict is only defined here; it is not passed to the classifier above.)
tpot_config = {
    'tpot.nn.PytorchLRClassifier': {
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.]
    }
}

California housing dataset

train_data = /content/sample_data/california_housing_train.csv;
test_data = /content/sample_data/california_housing_test.csv

In [None]:
import pandas as pd
#Load both California housing sample files.
cal_house_train_data = pd.read_csv("/content/sample_data/california_housing_train.csv")
cal_house_test_data = pd.read_csv("/content/sample_data/california_housing_test.csv")

#Stack the two files row-wise into one dataset. pd.merge() without keys is an
#inner join on every shared column, which keeps only rows that appear
#identically in both files instead of combining them.
cal_house_data = pd.concat(
    [cal_house_train_data, cal_house_test_data], ignore_index=True)

In [None]:
#Regression
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split

#Perform a train test split.
#cal_house_data is a plain DataFrame (it has no .data/.target attributes), so
#features and label are selected by column name.
#NOTE(review): "median_house_value" is assumed to be the label column of the
#Colab sample_data CSVs - confirm if a different file is used.
target_col = "median_house_value"
X_train, X_test, y_train, y_test = train_test_split(
    cal_house_data.drop(columns=target_col), cal_house_data[target_col],
    train_size=0.75, test_size=0.25, random_state=21)
#memory='auto' (the original 'áuto' was a typo that TPOT would treat as a
#cache directory name).
tpot_reg = TPOTRegressor(
    generations=99, population_size=99, mutation_rate=0.75,
    crossover_rate=0.25, cv=5, subsample=0.95, verbosity=2, n_jobs=-2,
    scoring='r2', random_state=21, max_eval_time_mins=0.5,
    config_dict='TPOT light', memory='auto',
    log_file='tpot_cal_house_data_logs')
tpot_reg.fit(X_train, y_train)
print(tpot_reg.score(X_test, y_test))
tpot_reg.export('tpot_california_house_prices_pipeline.py')

Latest India Covid-19 statewise status dataset:



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests 
from bs4 import BeautifulSoup 
import geopandas as gpd
from prettytable import PrettyTable

# Scrape the state-wise Covid-19 table, convert it to a DataFrame with integer
# counts, and print it (plus a totals row) as a PrettyTable.

# Specify the URL for the official ministry of health website.
url = "https://www.mygov.in/corona-data/covid19-statewise-status/"

# Make a GET request to fetch the raw HTML content.
# The timeout keeps the cell from hanging forever on a dead connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
web_content = response.content

# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")


# Remove any newlines and extra spaces from left and right.
def extract_contents(row):
    return [x.text.replace("\n", "").strip() for x in row]


stats = [] # Initialize stats.
all_rows = soup.find_all("tr") # Find all the table rows.

for row in all_rows:
    stat = extract_contents(row.find_all("td")) # Find all data cells.
    # Only rows with exactly 5 cells belong to the table we want.
    if len(stat) == 5:
        stats.append(stat)

#Now convert the data into a pandas dataframe for further processing.
new_cols = ["Sr.No", "States/UT","Confirmed","Recovered","Deceased"]
state_data = pd.DataFrame(data = stats, columns = new_cols)

#Converting the 'string' data to 'int'. Thousands separators ("1,234") are
#dropped first because int() rejects them.
for count_col in ["Confirmed", "Recovered", "Deceased"]:
    state_data[count_col] = state_data[count_col].map(
        lambda text: int(text.replace(",", "")))

# Pretty table representation
table = PrettyTable()
table.field_names = (new_cols)
for i in stats:
    table.add_row(i)
# Final row: nationwide totals of the three count columns.
table.add_row(["","Total",
               state_data["Confirmed"].sum(),
               state_data["Recovered"].sum(),
               state_data["Deceased"].sum()])
print(table)

#Horizontal bar chart: total confirmed cases per state/UT.
sns.set_style("ticks")
plt.figure(figsize = (15,10))
state_names = state_data["States/UT"]
confirmed_counts = state_data["Confirmed"].map(int)
plt.barh(state_names, confirmed_counts,
         align = "center", color = "lightblue", edgecolor = "blue")
plt.xlabel("No. of Confirmed cases", fontsize = 18)
plt.ylabel("States/UT", fontsize = 18)
plt.gca().invert_yaxis() # Keep the states in the order they appear in the table.
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
plt.title("Total Confirmed Cases Statewise", fontsize = 20)

#Write each bar's value at its tip.
for bar_pos, bar_value in enumerate(state_data["Confirmed"]):
    plt.text(bar_value, bar_pos, str(bar_value),
             fontsize = 12, verticalalignment = "center")
plt.show()

#Donut chart: nationwide totals of confirmed, recovered and deceased cases.
group_size = [sum(state_data[col])
              for col in ("Confirmed", "Recovered", "Deceased")]

group_labels = ["Confirmed\n" + str(group_size[0]),
                "Recovered\n" + str(group_size[1]),
                "Deceased\n"  + str(group_size[2])]
custom_colors = ["skyblue", "yellowgreen", "tomato"]

plt.figure(figsize = (5,5))
plt.pie(group_size, labels = group_labels, colors = custom_colors)
#A white disc over the centre turns the pie into a donut.
central_circle = plt.Circle((0,0), 0.5, color = "white")
fig = plt.gcf()
fig.gca().add_artist(central_circle)
plt.rc("font", size = 12)
plt.title("Nationwide total Confirmed, Recovered and Deceased Cases", fontsize = 16)
plt.show()

# Read the state wise shapefile of India in a GeoDataFrame and preview it.
map_data = gpd.read_file("Indian_States.shp")
map_data.rename(columns = {"st_nm":"States/UT"}, inplace = True)
map_data.head()

# Rewrite the state names in the map dataframe (presumably so they match the
# spellings in state_data for the merge below).
# The result is assigned back to the column: an inplace replace() on a column
# selection is a chained assignment that newer pandas does not write back.
state_name_fixes = {
    "Arunanchal Pradesh": "Arunachal Pradesh",
    "Telangana": "Telengana",
    "NCT of Delhi": "Delhi",
}
map_data["States/UT"] = (
    map_data["States/UT"]
    .str.replace('&', 'and', regex=False)  # literal replacement, not a regex
    .replace(state_name_fixes)
)

# Merge both the dataframes - state_data and map_data.
# how="left" keeps every map shape; shapes without a match get 0 below.
merged_data = pd.merge(map_data, state_data, how = "left", on = "States/UT")
merged_data.fillna(0, inplace = True)
merged_data.drop("Sr.No", axis = 1, inplace = True)
merged_data.head()

# Create figure and axes for Matplotlib and set the title.
fig, ax = plt.subplots(1, figsize=(20, 12))
ax.axis('off')
ax.set_title('Covid-19 Statewise Data - Confirmed Cases', fontdict = {'fontsize': '25', 'fontweight' : '3'})
# Plot the figure: shade each state by its confirmed-case count.
merged_data.plot(column = 'Confirmed', cmap='YlOrRd', linewidth=0.8, ax=ax, edgecolor='0.8', legend = True)
plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.export_utils import set_param_recursive
#TPOT
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
#Perform the data and metric imports
import sklearn.model_selection
import sklearn.metrics
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable

#Scrape the state-wise table into state_data, then run a pipeline in the
#style of a TPOT export (Normalizer + KNeighborsClassifier) on it.

#Specify the URL for Web Scraping.
url = "https://www.mygov.in/corona-data/covid19-statewise-status/"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Remove newlines from each cell's text.
# (Surrounding spaces are NOT stripped here, despite the original comment.)
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td')) 
# Keep only the rows that have exactly 5 cells.
    if len(stat) == 5:
        stats.append(stat)
#Now convert the data into a pandas dataframe for further processing.
new_cols = ["Sr.No", "States/UT","Confirmed","Recovered","Deceased"]
state_data = pd.DataFrame(data = stats, columns = new_cols)
state_data.head()
#Scraped data columns are of 'string' datatype.
#Convert them into 'int' datatype.
state_data["Confirmed"] = state_data["Confirmed"].map(int)
state_data["Recovered"] = state_data["Recovered"].map(int)
state_data["Deceased"] = state_data["Deceased"].map(int)

import csv
data = state_data
#NOTE(review): csv.reader() is handed the DataFrame itself rather than a file
#or lines of text, and pd.read_csv() below is then handed that reader object.
#Presumably the intent was to use state_data directly - confirm.
data = csv.reader(data)  
print(data)
# NOTE: Make sure that the outcome column is labeled 'target' in the data file.
#NOTE(review): state_data only has the new_cols columns above; no 'target'
#column is created in this cell, and "Sr.No"/"States/UT" were not converted
#to numbers, so dtype=np.float64 would not fit them - confirm the intent.
tpot_data = pd.read_csv(data, sep=',', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.9826086956521738
#Pipeline: L2-normalise each sample, then classify with a distance-weighted
#5-nearest-neighbour model (p=2).
exported_pipeline = make_pipeline(
    Normalizer(norm="l2"),
    KNeighborsClassifier(n_neighbors=5, p=2, weights="distance")
)
# Fix random state for all the steps in exported pipeline.
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

#Fit on the training split and predict the held-out split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

**TPOT Regressor**

In [None]:
from sklearn.datasets import load_boston
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.export_utils import set_param_recursive
#TPOT
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
# data and metric imports
import sklearn.model_selection
import sklearn.metrics
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable

#Scrape the state-wise Covid-19 table and fit a TPOT regressor on it.

#Specify the URL for Web Scraping.
url = "https://www.mygov.in/corona-data/covid19-statewise-status/"
# Make a GET request to fetch the raw HTML content.
# The timeout keeps the cell from hanging forever on a dead connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
web_content = response.content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")


# Remove any newlines and extra spaces from left and right.
def extract_contents(row):
    return [x.text.replace('\n', '').strip() for x in row]


# Find all table rows and data cells within.
# Only rows with exactly 5 cells belong to the table we want.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    if len(stat) == 5:
        stats.append(stat)
if not stats:
    # Fail here with a clear message instead of deep inside train_test_split.
    raise RuntimeError("No 5-column table rows found at " + url)
#Now convert the data into a pandas dataframe for further processing.
new_cols = ["Sr.No", "States/UT","Confirmed","Recovered","Deceased"]
state_data = pd.DataFrame(data = stats, columns = new_cols)
state_data.head()
#Scraped data columns are of 'string' datatype.
#Convert them into 'int' datatype; thousands separators ("1,234") are dropped
#first because int() rejects them.
for count_col in ["Confirmed", "Recovered", "Deceased"]:
    state_data[count_col] = state_data[count_col].map(
        lambda text: int(text.replace(",", "")))
#Perform a train test split.
#state_data is a plain DataFrame (it has no .data/.target attributes), so
#features and label are selected by column name.
#NOTE(review): predicting "Deceased" from the other two counts is an assumed
#choice - the original never named the columns; confirm the intended target.
feature_cols = ["Confirmed", "Recovered"]
target_col = "Deceased"
X_train, X_test, y_train, y_test = train_test_split(
    state_data[feature_cols], state_data[target_col],
    train_size=0.75, test_size=0.25, random_state=21)
#memory='auto' (the original 'áuto' was a typo that TPOT would treat as a
#cache directory name).
tpot_reg = TPOTRegressor(
    generations=99, population_size=99, mutation_rate=0.75,
    crossover_rate=0.25, cv=7, subsample=0.95, verbosity=2, n_jobs=-2,
    scoring='r2', random_state=21, max_eval_time_mins=0.5,
    config_dict='TPOT light', memory='auto',
    log_file='tpot_state_data_logs')
tpot_reg.fit(X_train, y_train)
print(tpot_reg.score(X_test, y_test))
tpot_reg.export("tpot_state_data_pipeline.py")

In [None]:
from sklearn.datasets import load_boston
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
#Fit a TPOT regressor on the Boston housing data, report the hold-out score
#and export the best pipeline as a Python script.
#NOTE(review): load_boston() was removed in scikit-learn 1.2; this cell needs
#an older scikit-learn or a different dataset loader.
house_data = load_boston()
#Perform a train test split.
#Seeded like the TPOT search so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    house_data.data, house_data.target, train_size=0.75, test_size=0.25,
    random_state=21)
#memory='auto' (the original 'áuto' was a typo that TPOT would treat as a
#cache directory name).
tpot_reg = TPOTRegressor(
    generations=99, population_size=99, mutation_rate=0.75,
    crossover_rate=0.25, cv=5, subsample=0.95, verbosity=2, n_jobs=-2,
    scoring='r2', random_state=21, max_eval_time_mins=0.5,
    config_dict='TPOT light', memory='auto',
    log_file='tpot_boston_data_logs')
tpot_reg.fit(X_train, y_train)
print(tpot_reg.score(X_test, y_test))
tpot_reg.export('tpot_reg_house_prices_pipeline.py')

**Neural network classifier using TPOT-NN:**

In [None]:
#Perform weather forecast using H2O.
import requests
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Scrape table rows from the climate.gov datasets page, then (inside the same
# loop) attempt an H2O GBM fit and a TPOT-NN fit.
# NOTE(review): this cell uses pd, sklearn, weather_data, train and valid
# without importing or assigning them here - they must come from earlier
# cells, or the cell fails. Confirm before running.
url = "https://www.climate.gov/maps-data/datasets"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Remove newlines from each cell's text.
# (Surrounding spaces are NOT stripped here, despite the original comment.)
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  # Keep only the rows that have exactly 5 cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  # NOTE(review): everything from here on is indented inside the row loop, so
  # it runs once per table row; the inner loop runs again for every item
  # obtained by iterating `row`. Presumably it was meant to run once, after
  # the loop - confirm.
  new_cols = []  # assigned but never used in this cell
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    weather_data[each_new_col] = stats_data[each_new_col].map(int)
    # Unpacking expects weather_data to yield exactly two items.
    X, y = weather_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1313)
    model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=100,
                                    max_depth=4,
                                    learn_rate=0.1)
    #Specify the predictor set and response.
    x = list(weather_data.columns)
    x
    #Train the model.
    # NOTE(review): train and valid are not defined in this cell.
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(model)
    #TPOT-NN
    from tpot import TPOTClassifier
    from sklearn.model_selection import train_test_split
    # Second split of the same X, y, this time 75/25 and unseeded.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

    clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                        verbosity=2, population_size=10, generations=10)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    clf.export('tpot_nn_weather_data_pipeline.py')

In [None]:
import requests
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Scrape table rows from the Kaggle chaii competition data page, then (inside
# the same loop) attempt an H2O GBM fit and a TPOT-NN fit.
# NOTE(review): this cell uses pd, sklearn, chaii_data, train and valid
# without importing or assigning them here - they must come from earlier
# cells, or the cell fails. Confirm before running.
url = "https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering/data"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Remove newlines from each cell's text.
# (Surrounding spaces are NOT stripped here, despite the original comment.)
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  # Keep only the rows that have exactly 5 cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  # NOTE(review): everything from here on is indented inside the row loop, so
  # it runs once per table row; the inner loop runs again for every item
  # obtained by iterating `row`. Presumably it was meant to run once, after
  # the loop - confirm.
  new_cols = []  # assigned but never used in this cell
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    chaii_data[each_new_col] = stats_data[each_new_col].map(int)
    # Unpacking expects chaii_data to yield exactly two items.
    X, y = chaii_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=13131)
    model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=1000,
                                    max_depth=4,
                                    learn_rate=0.1)
    #Specify the predictor set and response.
    x = list(chaii_data.columns)
    x
    #Train the model.
    # NOTE(review): train and valid are not defined in this cell.
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(model)
    #TPOT-NN
    from tpot import TPOTClassifier
    from sklearn.model_selection import train_test_split
    # Second split of the same X, y, this time 75/25 and unseeded.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

    clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                        verbosity=2, population_size=10, generations=10)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    # NOTE(review): the export name says "weather_data" although this cell
    # works on chaii_data - looks copy-pasted; confirm the intended file name.
    clf.export('tpot_nn_weather_data_pipeline.py')

Music dataset using TPOT-NN:

In [None]:
#Musicnet dataset
import requests
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Scrape table rows from the Kaggle MusicNet dataset page, then (inside the
# same loop) attempt an H2O GBM fit and a TPOT-NN fit.
# NOTE(review): this cell uses pd, sklearn, music_data, train and valid
# without importing or assigning them here - they must come from earlier
# cells, or the cell fails. Confirm before running.
url = "https://www.kaggle.com/imsparsh/musicnet-dataset?select=musicnet_metadata.csv"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Remove newlines from each cell's text.
# (Surrounding spaces are NOT stripped here, despite the original comment.)
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  # Keep only the rows that have exactly 5 cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  # NOTE(review): everything from here on is indented inside the row loop, so
  # it runs once per table row; the inner loop runs again for every item
  # obtained by iterating `row`. Presumably it was meant to run once, after
  # the loop - confirm.
  new_cols = []  # assigned but never used in this cell
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    music_data[each_new_col] = stats_data[each_new_col].map(int)
    # Unpacking expects music_data to yield exactly two items.
    X, y = music_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1313)
    model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=100,
                                    max_depth=4,
                                    learn_rate=0.1)
    #Specify the predictor set and response.
    x = list(music_data.columns)
    x
    #Train the model.
    # NOTE(review): train and valid are not defined in this cell.
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(model)
    #TPOT-NN
    from tpot import TPOTClassifier
    from sklearn.model_selection import train_test_split
    # Second split of the same X, y, this time 75/25 and unseeded.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

    clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                        verbosity=2, population_size=10, generations=10)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    clf.export('tpot_nn_music_data_pipeline.py')

In [None]:
import requests
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Scrape table rows from the Auto-WEKA datasets page, then (inside the same
# loop) attempt an H2O GBM fit and a TPOT-NN fit.
# NOTE(review): this cell uses pd, sklearn, autoweka_datasets, train and valid
# without importing or assigning them here - they must come from earlier
# cells, or the cell fails. Confirm before running.
url = "http://www.cs.ubc.ca/labs/beta/Projects/autoweka/datasets/"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Remove newlines from each cell's text.
# (Surrounding spaces are NOT stripped here, despite the original comment.)
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  # Keep only the rows that have exactly 5 cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  # NOTE(review): everything from here on is indented inside the row loop, so
  # it runs once per table row; the inner loop runs again for every item
  # obtained by iterating `row`. Presumably it was meant to run once, after
  # the loop - confirm.
  new_cols = []  # assigned but never used in this cell
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()

    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    autoweka_datasets[each_new_col] = stats_data[each_new_col].map(int)
    # Unpacking expects autoweka_datasets to yield exactly two items.
    X, y = autoweka_datasets
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1313)
    model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=100,
                                    max_depth=4,
                                    learn_rate=0.1)
    
    #Specify the predictor set and response.
    x = list(autoweka_datasets.columns)
    x

    #Train the model.
    # NOTE(review): train and valid are not defined in this cell.
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(model)

    #TPOT-NN
    from tpot import TPOTClassifier
    from sklearn.model_selection import train_test_split
    # Second split of the same X, y, this time 75/25 and unseeded.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

    clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                        verbosity=2, population_size=10, generations=10)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    clf.export('tpot_nn_autoweka_datasets_pipeline.py')

In [None]:
import requests
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Scrape table rows from the climate.gov datasets page, then (inside the same
# loop) attempt an H2O GBM fit and a TPOT-NN fit on covid19_india_data.
# NOTE(review): the URL is the climate.gov page although the variables refer
# to Covid-19 India data - confirm which source is intended.
# NOTE(review): this cell uses pd, sklearn, covid19_india_data, train and
# valid without importing or assigning them here - they must come from
# earlier cells, or the cell fails. Confirm before running.
url = "https://www.climate.gov/maps-data/datasets"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Remove newlines from each cell's text.
# (Surrounding spaces are NOT stripped here, despite the original comment.)
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  # Keep only the rows that have exactly 5 cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  # NOTE(review): everything from here on is indented inside the row loop, so
  # it runs once per table row; the inner loop runs again for every item
  # obtained by iterating `row`. Presumably it was meant to run once, after
  # the loop - confirm.
  new_cols = []  # assigned but never used in this cell
  for each_new_col in row:
    # stats_data is built here but not read again in this cell.
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()

    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    covid19_india_data[each_new_col] = covid19_india_data[each_new_col].map(int)
    # Unpacking expects covid19_india_data to yield exactly two items.
    X, y = covid19_india_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1313)
    model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=100,
                                    max_depth=4,
                                    learn_rate=0.1)
    
    #Specify the predictor set and response.
    x = list(covid19_india_data.columns)
    x

    #Train the model.
    # NOTE(review): train and valid are not defined in this cell.
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(model)
    
    #TPOT-NN
    from tpot import TPOTClassifier
    from sklearn.model_selection import train_test_split
    # Second split of the same X, y, this time 70/30 and unseeded.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3)

    clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                        verbosity=2, population_size=10, generations=10)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    clf.export('tpot_nn_covid19_india_data_pipeline.py')
    # Draw the confirmed-cases choropleth from merged_data (not built in this
    # cell - it must already exist). The typographic quotes pasted into the
    # original were a syntax error and are replaced with plain ASCII quotes.
    fig, ax = plt.subplots(1, figsize=(20, 12))
    ax.axis('off')
    ax.set_title('Covid-19 Statewise Data — Confirmed Cases',
                fontdict =  {'fontsize': '25', 'fontweight' : '3'})
    merged_data.plot(column = 'Confirmed', cmap='YlOrRd',
                    linewidth=0.8, ax=ax, edgecolor='0.8',
                    legend = True)
    plt.show()

In [None]:
import requests
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Scrape the <tr> rows of the page below and, for each item yielded by
# iterating a row, run an H2O GBM fit, cross-validation, a grid search, a
# TPOT-NN search and a plot.
# NOTE(review): `pd`, `plt`, `sklearn`, `h2o`, `covid19_india_data`, `train`,
# `valid`, `test`, `data` and `merged_data` are used below but are neither
# imported nor assigned in this cell; presumably earlier cells provide them --
# confirm before running.
url = "https://docs.gitlab.com/ee/development/value_stream_analytics.html#data-collector"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of each element with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td'))
  # Keep only rows that have exactly five <td> cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  new_cols = []
  # NOTE(review): this inner loop runs the whole modelling pipeline once per
  # item of `row`, and passes that item as the DataFrame's `columns`; confirm
  # that this nesting is intended.
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    covid19_india_data[each_new_col] = covid19_india_data[each_new_col].map(int)
    # NOTE(review): unpacking an object into exactly two names only works if
    # iterating it yields two items; the feature/target selection is unclear.
    X, y = covid19_india_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1313)
    model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=100,
                                    max_depth=4,
                                    learn_rate=0.1)
    #Specify the predictor set and response.
    x = list(covid19_india_data.columns)
    x
    #Train the model.
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(model)
    perf = model.model_performance(test)
    print(perf.__class__)
    #Area Under the ROC Curve (AUC)
    perf.auc()
    perf.mse()
    #Cross-validated Performance
    cvmodel = H2OGradientBoostingEstimator(distribution='bernoulli',
                                       ntrees=1000,
                                       max_depth=4,
                                       learn_rate=0.1,
                                       nfolds=5)
    cvmodel.train(x=x, y=y, training_frame=data)
    print(cvmodel.auc(train=True))
    print(cvmodel.auc(xval=True))
    #Grid Search
    #ntrees: Number of trees
    #max_depth: Maximum depth of a tree
    #learn_rate: Learning rate in the GBM
    ntrees_opt = [5,50,100]
    max_depth_opt = [2,3,5]
    learn_rate_opt = [0.1,0.2]
    hyper_params = {'ntrees': ntrees_opt,
                    'max_depth': max_depth_opt,
                    'learn_rate': learn_rate_opt}

    #Define an "H2OGridSearch" object by specifying the algorithm (GBM) and the hyper parameters.
    from h2o.grid.grid_search import H2OGridSearch
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params = hyper_params)
    gs.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(gs)

    # Print out the AUC for all of the models.
    auc_table = gs.sort_by('auc(valid=True)',increasing=False)
    print(auc_table)
    #Get the best model in terms of AUC.
    best_model = h2o.get_model(auc_table['Model Id'][0])
    best_model.auc()
    #Generate predictions on the test set using the "best" model, and evaluate the test set AUC.
    best_perf = best_model.model_performance(test)
    best_perf.auc()

    #TPOT-NN
    from tpot import TPOTClassifier
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3)

    clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                        verbosity=2, population_size=10, generations=10)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    clf.export('tpot_nn_data_pipeline.py')
    # The string literals below use plain ASCII quotes; typographic quotes
    # are not valid string delimiters in Python.
    fig, ax = plt.subplots(1, figsize=(20, 12))
    ax.axis('off')
    ax.set_title('Covid-19 Statewise Data — Confirmed Cases',
                fontdict =  {'fontsize': '25', 'fontweight' : '3'})
    merged_data.plot(column = 'Confirmed', cmap='YlOrRd',
                    linewidth=0.8, ax=ax, edgecolor='0.8',
                    legend = True)
    plt.show()

In [None]:
#Kaggle dataset
import pandas as pd
import requests
import sklearn.model_selection
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Scrape the <tr> rows of a Kaggle dataset page and, for each item yielded by
# iterating a row, attempt an H2O GBM fit followed by a TPOT-NN search.
# NOTE(review): `kaggle_data`, `train` and `valid` are never assigned in this
# cell; presumably an earlier cell defines them -- confirm before running.
url = "https://www.kaggle.com/kiva/data-science-for-good-kiva-crowdfunding"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of each element with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td'))
  # Keep only rows that have exactly five <td> cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  new_cols = []
  # NOTE(review): the modelling code below runs once per item of `row`, and
  # that item is passed as the DataFrame's `columns`; confirm this nesting.
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()

    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    kaggle_data[each_new_col] = stats_data[each_new_col].map(int)
    # NOTE(review): unpacking into two names only works if iterating
    # `kaggle_data` yields exactly two items; feature/target choice unclear.
    X, y = kaggle_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=131311)
    model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=100,
                                    max_depth=4,
                                    learn_rate=0.1)

    #Specify the predictor set and response.
    x = list(kaggle_data.columns)
    x

    #Train the model.
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(model)

    #TPOT-NN
    from tpot import TPOTClassifier
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

    clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                        verbosity=2, population_size=100, generations=100)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    clf.export('tpot_nn_kaggle_dataset_pipeline.py')

In [None]:
from tpot import TPOTClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# Synthetic two-centre problem: 100 samples with 3 features each.
X, y = make_blobs(n_samples=100, n_features=3, centers=2, random_state=424)

# Hold out a quarter of the samples for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25
)

# TPOT search settings, gathered in one place.
tpot_settings = {
    'config_dict': 'TPOT NN',
    'template': 'Selector-Transformer-PytorchLRClassifier',
    'verbosity': 2,
    'population_size': 10,
    'generations': 10,
}
clf = TPOTClassifier(**tpot_settings)
clf.fit(X_train, y_train)

# Score on the held-out split and show the result.
holdout_score = clf.score(X_test, y_test)
print(holdout_score)

# Write the fitted pipeline out as a standalone script.
clf.export('tpot_nn_data_pipeline.py')

**NOTE:** It turns out that TPOT cannot currently solve multi-label regression problems, as shown below:

In [None]:
#Latest India Covid-19 statewise status
from tpot import TPOTClassifier
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.export_utils import set_param_recursive
#TPOT
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
# Perform the data and metric imports.
import sklearn.model_selection
import sklearn.metrics
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable

# Download the page at `url`, keep every table row that has exactly five
# <td> cells, and load those rows into the `state_data` DataFrame with the
# three count columns converted to int.
#Specify the URL for the Web Scraping.
url = "https://www.mygov.in/corona-data/covid19-statewise-status/"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Return the text of each element with newline characters removed
# (surrounding spaces are not stripped).
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td')) 
# Only rows with exactly five cells are kept; anything else is skipped.
    if len(stat) == 5:
        stats.append(stat)

# Convert the collected rows into a pandas DataFrame; the five labels below
# line up with the five cells kept per row.
new_cols = ["Sr.No", "States/UT","Confirmed","Recovered","Deceased"]
state_data = pd.DataFrame(data = stats, columns = new_cols)
state_data.head()

# The scraped values are strings; convert the three count columns to int.
# NOTE(review): int() is applied to each cell as-is -- confirm the scraped
# text contains nothing but digits.
state_data["Confirmed"] = state_data["Confirmed"].map(int)
state_data["Recovered"] = state_data["Recovered"].map(int)
state_data["Deceased"] = state_data["Deceased"].map(int)

def getNumbers():
    """Return the fixed placeholder pair ``('one', 'two')``."""
    placeholder_pair = ('one', 'two')
    return placeholder_pair
# Unpack the placeholder pair returned by getNumbers().
one, two = getNumbers()

# X and y are the strings 'one' and 'two' here, not columns of `state_data`,
# so the split below operates on those two placeholder strings.
X, y = getNumbers()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)
#TPOT-NN
# The classifier is only constructed in this cell; it is not fitted.
clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                     verbosity=2, population_size=10, generations=10)

#NOTE: Turns out TPOT cannot solve multi label regression problems at this time
# The fit/score/export calls below sit inside a bare string literal, so they
# are not executed.
'''clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
clf.export('tpot_nn_state_data_pipeline.py')
'''

In [None]:
#Import scikit-learn.
import sklearn

**Auto-Sklearn**

arXiv:2007.04074v2 [cs.LG]

arXiv:2007.04074 

@article{ASKL2,
   title = {Auto-Sklearn 2.0},
   author = {Feurer, Matthias and Eggensperger, Katharina and
             Falkner, Stefan and Lindauer, Marius and Hutter, Frank},
   booktitle = {Advances in Neural Information Processing Systems 28},
   year = {2020},
   journal = {arXiv:2007.04074 [cs.LG]},
}

**Install Auto-Sklearn**

In [None]:
!python3 -m pip install --upgrade pip

In [None]:
!pip3 install --upgrade pandas

In [None]:
!pip3 install auto-sklearn

In [None]:
# Install auto-sklearn together with the build tools and extra packages used
# by this notebook.
# NOTE(review): auto-sklearn is installed three times in this cell (once with
# --upgrade, twice pinned to 0.10.0); the last pin is the one that should
# remain installed -- confirm which version is actually intended.
!pip3 install --upgrade scipy
!pip3 install --upgrade auto-sklearn
!pip install auto-sklearn==0.10.0
!pip install --upgrade pip
# Compiler toolchain and SWIG, installed through apt.
!sudo apt-get install build-essential swig 
# Feed each line of auto-sklearn's requirements.txt (master branch) to its
# own `pip install` call.
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install 
!pip install auto-sklearn==0.10.0
!pip install matplotlib-venn
!apt-get -qq install -y libfluidsynth1

Install the 7-Zip reader `libarchive`:

In [None]:
!apt-get -qq install -y libarchive-dev && pip install -U libarchive
import libarchive

In [None]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

# Train a single Auto-sklearn classifier on the Iris data and report its
# accuracy on a held-out split.
# The classifier is bound to `cls` only: binding it to the name `automl`
# would overwrite the `automl()` function defined at the top of this
# notebook, and fitting a second classifier would repeat the whole search.
if __name__ == "__main__":
    X, y = sklearn.datasets.load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=1)
    cls = autosklearn.classification.AutoSklearnClassifier()
    #Fit.
    cls.fit(X_train, y_train)
    #Predict.
    predictions = cls.predict(X_test)
    # `y_hat` is kept as a second name for the predictions for any later
    # cell that reads it.
    y_hat = predictions
    #Print the accuracy score.
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))

In [None]:
import autosklearn.classification
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

# Train a single Auto-sklearn classifier on the Digits data and report its
# accuracy on a held-out split.
# The classifier is bound to `cls` only: binding it to the name `automl`
# would overwrite the `automl()` function defined at the top of this
# notebook, and fitting a second classifier would repeat the whole search.
if __name__ == "__main__":
    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=1)
    cls = autosklearn.classification.AutoSklearnClassifier()
    #Fit.
    cls.fit(X_train, y_train)
    #Predict.
    predictions = cls.predict(X_test)
    # `y_hat` is kept as a second name for the predictions for any later
    # cell that reads it.
    y_hat = predictions
    #Print the accuracy score.
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))

In [None]:
# Imports for every library name this cell uses, so it can run on its own.
import pandas as pd
import requests
import sklearn.metrics
import sklearn.model_selection
from autoPyTorch import AutoNetClassification
from bs4 import BeautifulSoup
from sklearn.neighbors import KNeighborsClassifier

# Scrape the <tr> rows of a Kaggle page and, per row, attempt an Auto-PyTorch
# fit followed by a 1-nearest-neighbour baseline.
# Make a GET request to fetch the raw HTML content.
web_content = requests.get("https://www.kaggle.com/kiva/data-science-for-good-kiva-crowdfunding").content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of each element with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))

    # Keep only rows that have exactly five <td> cells.
    if len(stat) == 5:
        stats.append(stat)
    # NOTE(review): everything from here on is still inside the per-row loop,
    # so the DataFrame is rebuilt and the models refitted for every row;
    # confirm whether this was meant to run once after the loop.
    #Now convert the data into a pandas dataframe for further processing.
    # NOTE(review): `new_cols` is empty, so no column labels are supplied for
    # the five-cell rows collected above -- the real column names are needed.
    new_cols=[]
    ozone_data = pd.DataFrame(data = stats, columns = new_cols)
    ozone_data.head()
    #Scraped data columns are of 'string' datatype.
    #Convert them into 'int' datatype.
    ozone_data[new_cols] = ozone_data[new_cols].map(int)
    # NOTE(review): unpacking a DataFrame iterates its column labels; the
    # feature/target selection still has to be written.
    X, y = ozone_data
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=110011)

    #Auto-PyTorch
    # Budgets are the small literals from the Auto-PyTorch README quick-start.
    # Avoid expressions such as 999999999**10000000 here: Python would first
    # have to build an integer with roughly 90 million digits.
    autoPyTorch = AutoNetClassification("tiny_cs",  # config preset
                                    log_level='info',
                                    max_runtime=300,
                                    min_budget=30,
                                    max_budget=90)

    autoPyTorch.fit(X_train, y_train, validation_split=0.3)
    y_pred = autoPyTorch.predict(X_test)
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred))
    #Fit.
    clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
    #Predict.
    y_pred = clf.predict(X_test)
    #Print the confusion matrix and classification report.
    # Use the fully qualified `sklearn.metrics` name, matching the accuracy
    # call above; a bare `metrics` name is not imported in this cell.
    print(sklearn.metrics.confusion_matrix(y_test, y_pred))
    print(sklearn.metrics.classification_report(y_test, y_pred))

**ConfigSpace**

@article{
    title   = {BOAH: A Tool Suite for Multi-Fidelity Bayesian Optimization & Analysis of Hyperparameters},
    author  = {M. Lindauer and K. Eggensperger and M. Feurer and A. Biedenkapp and J. Marben and P. Müller and F. Hutter},
    journal = {arXiv:1908.06756 {[cs.LG]}},
    date    = {2019},
}

**Install ConfigSpace:**

In [None]:
!pip install ConfigSpace

In [None]:
# Create an empty configuration space with a fixed seed of 1234.
import ConfigSpace as CS
cs = CS.ConfigurationSpace(seed=1234)

In [None]:
# Build a seeded configuration space, add an integer, a categorical and an
# ordinal hyperparameter to it, and draw sample configurations.
# Define a float hyperparameter `alpha` on [0, 1].
# NOTE(review): `alpha` is created here but never added to `cs` in this cell.
import ConfigSpace.hyperparameters as CSH
alpha = CSH.UniformFloatHyperparameter(name='alpha', lower=0, upper=1)
# Create a fresh ConfigurationSpace object (this rebinds `cs`).
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
cs = CS.ConfigurationSpace(seed=1234)

# Integer hyperparameter between 10 and 100 and a three-way categorical.
a = CSH.UniformIntegerHyperparameter('a', lower=10, upper=100, log=False)
b = CSH.CategoricalHyperparameter('b', choices=['red', 'green', 'blue'])
cs.add_hyperparameters([a, b])
# Sample once with only `a` and `b` registered.
cs.sample_configuration()
# Add an ordinal hyperparameter; its sequence values are strings.
ord_hp = CSH.OrdinalHyperparameter('ordinal_hp', sequence=['10', '20', '30'])
cs.add_hyperparameter(ord_hp)
# Sample again now that `ordinal_hp` is part of the space.
cs.sample_configuration()

**Install Auto-PyTorch**

In [None]:
!pip install autopytorch

In [None]:
from autoPyTorch import AutoNetClassification
#Perform the data and metric imports
import sklearn.model_selection
import sklearn.metrics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable

# Scrape five-cell table rows from the page below into `state_data`, then
# attempt an Auto-PyTorch classification run on it.
#Specify the URL for Web Scraping.
url = "https://www.kaggle.com/sudalairajkumar/covid19-in-india"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of each element with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]

# Find all table rows and data cells within.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))

    # Keep only rows that have exactly five <td> cells.
    if len(stat) == 5:
        stats.append(stat)
#Now convert the data into a pandas dataframe for further processing.
new_cols = ["Sr.No", "States/UT","Confirmed","Recovered","Deceased"]
state_data = pd.DataFrame(data = stats, columns = new_cols)
state_data.head()

#Scraped data columns are of 'string' datatype.
#Convert them into 'int' datatype.
state_data["Confirmed"] = state_data["Confirmed"].map(int)
state_data["Recovered"] = state_data["Recovered"].map(int)
state_data["Deceased"] = state_data["Deceased"].map(int)

# NOTE(review): unpacking a DataFrame iterates its column labels, and
# `state_data` has the five columns listed in `new_cols`, so this cannot
# yield exactly two values; the feature/target selection still has to be
# written (and none of the five columns is an obvious class label).
X, y = state_data
X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

#Auto-PyTorch
# Budgets are the small literals from the Auto-PyTorch README quick-start.
# Avoid expressions such as 999999999**10000000 here: Python would first
# have to build an integer with roughly 90 million digits.
autoPyTorch = AutoNetClassification("tiny_cs",  # config preset
                                    log_level='info',
                                    max_runtime=300,
                                    min_budget=30,
                                    max_budget=90)
#Fit.
autoPyTorch.fit(X_train, y_train, validation_split=0.3)
#Predict.
y_pred = autoPyTorch.predict(X_test)
#Print the accuracy score.
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred))

In [None]:
from autoPyTorch import AutoNetClassification

# Perform the data and metric imports
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

# Fit Auto-PyTorch on the Iris data and report accuracy on a held-out split.
X, y = sklearn.datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

#Auto-PyTorch
# Budgets are the small literals from the Auto-PyTorch README quick-start.
# Avoid expressions such as 999999999**10000000 here: Python would first
# have to build an integer with roughly 90 million digits.
autoPyTorch = AutoNetClassification("tiny_cs",  # config preset
                                    log_level='info',
                                    max_runtime=300,
                                    min_budget=30,
                                    max_budget=90)
#Fit.
autoPyTorch.fit(X_train, y_train, validation_split=0.3)
#Predict.
y_pred = autoPyTorch.predict(X_test)
#Print the accuracy score.
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred))

In [None]:
from autoPyTorch import AutoNetClassification

# Perform the data and metric imports
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

# Fit Auto-PyTorch on the Wine data and report accuracy on a held-out split.
X, y = sklearn.datasets.load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

#Auto-PyTorch
# Budgets are the small literals from the Auto-PyTorch README quick-start.
# Avoid expressions such as 999999999**10000000 here: Python would first
# have to build an integer with roughly 90 million digits.
autoPyTorch = AutoNetClassification("tiny_cs",  # Config preset
                                    log_level='info',
                                    max_runtime=300,
                                    min_budget=30,
                                    max_budget=90)
#Fit.
autoPyTorch.fit(X_train, y_train, validation_split=0.3)
#Predict.
y_pred = autoPyTorch.predict(X_test)
#Print the accuracy score.
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred))

In [None]:
from autoPyTorch import AutoNetClassification
# Perform the data and metric imports
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

# Fit Auto-PyTorch on the Linnerud data and report a score on a held-out
# split.
# NOTE(review): scikit-learn documents Linnerud as a multi-output regression
# dataset, so a classifier scored with accuracy_score is likely the wrong
# tool here -- confirm the intended task before relying on this cell.
X, y = sklearn.datasets.load_linnerud(return_X_y=True)
X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

#Auto-PyTorch
# Budgets are the small literals from the Auto-PyTorch README quick-start.
# Avoid expressions such as 999999999**10000000 here: Python would first
# have to build an integer with roughly 90 million digits.
autoPyTorch = AutoNetClassification("tiny_cs",  # config preset
                                    log_level='info',
                                    max_runtime=300,
                                    min_budget=30,
                                    max_budget=90)
#Fit.
autoPyTorch.fit(X_train, y_train, validation_split=0.3)
#Predict.
y_pred = autoPyTorch.predict(X_test)
#Print the accuracy score.
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred))

In [None]:
# Perform the data and metric imports.
import sklearn.model_selection
import sklearn.metrics
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Imported here so the cell does not depend on an earlier cell for it.
from autoPyTorch import AutoNetClassification

# Scrape the <tr> rows of a Kaggle page and, per row, attempt an Auto-PyTorch
# classification run.
#Specify the URL.
url = "https://www.kaggle.com/brsdincer/ozone-tendency-new-data-20182021-nasa"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of each element with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]

# Find all table rows and data cells within.
stats = []
all_rows = soup.find_all('tr')

for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    # Keep only rows that have exactly five <td> cells.
    if len(stat) == 5:
        stats.append(stat)
    # NOTE(review): everything from here on is still inside the per-row loop,
    # so the DataFrame is rebuilt and the model refitted for every row;
    # confirm whether this was meant to run once after the loop.
    #Now convert the data into a pandas dataframe for further processing.
    # NOTE(review): `new_cols` is empty, so no column labels are supplied for
    # the five-cell rows collected above -- the real column names are needed.
    new_cols=[]
    ozone_data = pd.DataFrame(data = stats, columns = new_cols)
    ozone_data.head()

    #Scraped data columns are of 'string' datatype.
    #Convert them into 'int' datatype.
    ozone_data[new_cols] = ozone_data[new_cols].map(int)
    # NOTE(review): unpacking a DataFrame iterates its column labels; the
    # feature/target selection still has to be written.
    X, y = ozone_data
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=110011)

    #Auto-PyTorch
    # Budgets are the small literals from the Auto-PyTorch README quick-start.
    # Avoid expressions such as 999999999**10000000 here: Python would first
    # have to build an integer with roughly 90 million digits.
    autoPyTorch = AutoNetClassification("tiny_cs",  # config preset
                                    log_level='info',
                                    max_runtime=300,
                                    min_budget=30,
                                    max_budget=90)

    #Fit. Predict. Get the accuracy score.
    autoPyTorch.fit(X_train, y_train, validation_split=0.3)
    y_pred = autoPyTorch.predict(X_test)
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred))


In [None]:
# Perform the data and metric imports.
import sklearn.model_selection
import sklearn.metrics
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Imported here so the cell does not depend on an earlier cell for it.
from autoPyTorch import AutoNetClassification

# Scrape the <tr> rows of a Kaggle page and, per row, attempt an Auto-PyTorch
# classification run.
#Specify the URL.
url = "https://www.kaggle.com/kiva/data-science-for-good-kiva-crowdfunding"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of each element with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = []
all_rows = soup.find_all('tr')

for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    # Keep only rows that have exactly five <td> cells.
    if len(stat) == 5:
        stats.append(stat)

    # NOTE(review): everything from here on is still inside the per-row loop,
    # so the DataFrame is rebuilt and the model refitted for every row;
    # confirm whether this was meant to run once after the loop.
    #Now convert the data into a pandas dataframe for further processing.
    # NOTE(review): `new_cols` is empty, so no column labels are supplied for
    # the five-cell rows collected above -- the real column names are needed.
    new_cols=[]
    coll_data = pd.DataFrame(data = stats, columns = new_cols)
    coll_data.head()

    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    coll_data[new_cols] = coll_data[new_cols].map(int)
    # NOTE(review): unpacking a DataFrame iterates its column labels; the
    # feature/target selection still has to be written.
    X, y = coll_data
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=11001100)

    #Auto-PyTorch
    # min_budget keeps this cell's value of 333; max_budget and max_runtime
    # are small literals chosen in the same 1:3:10 proportion as the
    # Auto-PyTorch README quick-start (30 / 90 / 300) -- tune as needed.
    # Avoid expressions such as 999999999**10000000 here: Python would first
    # have to build an integer with roughly 90 million digits.
    autoPyTorch = AutoNetClassification("tiny_cs",  # config preset
                                    log_level='info',
                                    max_runtime=3330,
                                    min_budget=333,
                                    max_budget=999)

    #Fit. Predict. Get the accuracy score.
    autoPyTorch.fit(X_train, y_train, validation_split=0.3)
    y_pred = autoPyTorch.predict(X_test)
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred))


In [None]:
# Perform the data and metric imports.
import sklearn.model_selection
import sklearn.metrics
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
from prettytable import PrettyTable
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Imported here so the cell does not depend on an earlier cell for it.
from autoPyTorch import AutoNetClassification

# Scrape the <tr> rows of a Kaggle page and, per row, attempt an Auto-PyTorch
# classification run.
#Specify the URL.
url = "https://www.kaggle.com/nipunarora8/age-gender-and-ethnicity-face-data-csv"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of each element with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    # Keep only rows that have exactly five <td> cells.
    if len(stat) == 5:
        stats.append(stat)
    # NOTE(review): everything from here on is still inside the per-row loop,
    # so the DataFrame is rebuilt and the model refitted for every row;
    # confirm whether this was meant to run once after the loop.
    #Now convert the data into a pandas dataframe for further processing.
    # NOTE(review): `new_cols` is empty, so no column labels are supplied for
    # the five-cell rows collected above -- the real column names are needed.
    new_cols=[]
    face_data = pd.DataFrame(data = stats, columns = new_cols)
    face_data.head()
    #Scraped data columns are of 'string' datatype.
    #Convert them into 'int' datatype.
    face_data[new_cols] = face_data[new_cols].map(int)
    # NOTE(review): unpacking a DataFrame iterates its column labels; the
    # feature/target selection still has to be written.
    X, y = face_data
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=11001100)

    #Auto-PyTorch
    # min_budget keeps this cell's value of 333; max_budget and max_runtime
    # are small literals chosen in the same 1:3:10 proportion as the
    # Auto-PyTorch README quick-start (30 / 90 / 300) -- tune as needed.
    # Avoid expressions such as 999999999**10000000 here: Python would first
    # have to build an integer with roughly 90 million digits.
    autoPyTorch = AutoNetClassification("tiny_cs",  # config preset
                                    log_level='info',
                                    max_runtime=3330,
                                    min_budget=333,
                                    max_budget=999)

    #Fit. Predict. Get the accuracy score.
    autoPyTorch.fit(X_train, y_train, validation_split=0.3)
    y_pred = autoPyTorch.predict(X_test)
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred))


Install AutoGluon

In [None]:
#Install AutoGluon.
python3 -m pip install -U pip
python3 -m pip install -U setuptools wheel
python3 -m pip install -U "mxnet<2.0.0"
python3 -m pip install autogluon

In [None]:
#Image Prediction with AutoGluon
#Import AutoGluon.
%matplotlib inline
import autogluon.core as ag
from autogluon.vision import ImageDataset
import pandas as pd
#Celeb faces (celebA) dataset
import requests
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
#Specify the URL.
url = "https://www.kaggle.com/c/petfinder-adoption-prediction/data"

# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Remove any newlines and extra spaces from left and right.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')

for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  # Notice that the data that we require is now a list of length 5.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  new_cols = []
  
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of ‘string’ datatype so convert them into ‘int' datatype.
    olympics2021_data[each_new_col] = stats_data[each_new_col].map(int)
    X, y = stats_data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1221)
    csv_file_list=["BreedLabels.csv", "ColorLabels.csv", "PetFinder-BreedLabels.csv", 
                   "PetFinder-ColorLabels.csv", "PetFinder-StateLabels.csv", "PetFinder-StateLabels.csv (285 B)", 
                   "StateLabels.csv", "breed_labels.csv", "color_labels.csv", "state_labels.csv"]
    
    for each_csv_file in csv_file_list:
      csv_file = ag.utils.download(each_csv_file)
      df = pd.read_csv(csv_file)
      df.head()
      df = ImageDataset.from_csv(csv_file)
      df.head()
      train_data, _, test_data = ImageDataset.from_folders(each_csv_file, train='train', test='test')
      print('train #', len(train_data), 'test #', len(test_data))
      train_data.head()

      #Load the splits with from_folder.
      root = os.path.join(os.path.dirname(train_data.iloc[0]['image']), '..')
      all_data = ImageDataset.from_folder(root)
      all_data.head()

      #Split the dataset.
      train, val, test = all_data.random_split(val_size=0.1, test_size=0.1)
      print('train #:', len(train), 'test #:', len(test))

      #Convert a list of images to dataset.
      pets = ag.utils.download(each_csv_file)
      pets = ag.utils.unzip(pets)
      image_list = [x for x in os.listdir(os.path.join(pets, 'images')) if x.endswith('jpg')]
      new_data = ImageDataset.from_name_func(image_list, label_fn, root=os.path.join(os.getcwd(), pets, 'images'))
      new_data

    #Visualize the images.
    new_data.show_images()

    #Image prediction
    import autogluon.core as ag
    from autogluon.vision import ImagePredictor, ImageDataset
    train_dataset, _, test_dataset = ImageDataset.from_folders("img_align_pets.zip")
    print(train_dataset)

    #Fit a classifier.
    predictor = ImagePredictor()

    # Since the original dataset does not provide validation split, the `fit` function splits it randomly with 90/10 ratio.
    predictor.fit(train_dataset, hyperparameters={'epochs': 1000})

    #The best Top-1 accuracy achieved on the validation set is:
    fit_result = predictor.fit_summary()
    print('Top-1 train acc: %.3f, val acc: %.3f' %(fit_result['train_acc'], fit_result['valid_acc']))
    
    #Predict on a new image.
    image_path = test_dataset.iloc[0]['image']
    result = predictor.predict(image_path)
    print(result)
    bulk_result = predictor.predict(test_dataset)
    print(bulk_result)

    #Generate image features with a classifier.
    image_path = test_dataset.iloc[0]['image']
    feature = predictor.predict_feature(image_path)
    print(feature)

    #Validation and test top-1 accuracy is:
    test_acc = predictor.evaluate(test_dataset)
    print('Top-1 test acc: %.3f' % test_acc['top1'])

    #Save and load the classifiers.
    filename = 'predictor.ag'
    predictor.save(filename)
    predictor_loaded = ImagePredictor.load(filename)

    # Use predictor_loaded as usual.
    result = predictor_loaded.predict(image_path)
    print(result)

    #Use AutoGluon to produce an ImagePredictor to classify images.
    import autogluon.core as ag
    from autogluon.vision import ImagePredictor, ImageDataset
    train_data, _, test_data = ImageDataset.from_folders("img_align_celeba.zip")
    model = ag.Categorical('resnet18_v1b', 'mobilenetv3_small')
    model_list = ImagePredictor.list_models()

    #Specify the training hyper-parameters.
    batch_size = 8
    lr = ag.Categorical(1e-2, 1e-3)

    #Bayesian Optimization
    hyperparameters={'model': model, 'batch_size': batch_size, 'lr': lr, 'epochs': 2}
    predictor = ImagePredictor()
    predictor.fit(train_data, time_limit=60*10, hyperparameters=hyperparameters,
                  hyperparameter_tune_kwargs={'searcher': 'bayesopt', 'num_trials': 2})
    print('Top-1 val acc: %.3f' % predictor.fit_summary()['valid_acc'])

    #Load the test dataset and evaluate.
    results = predictor.evaluate(test_data)
    print('Test acc on hold-out data:', results)

In [None]:
#Global Superstore Orders 2016 dataset
#Tabular prediction with AutoGluon:

# Train AutoGluon tabular models on the CSV below, then predict, evaluate and
# print leaderboards.
#Predict columns in a table.
from autogluon.tabular import TabularDataset, TabularPredictor
train_data = TabularDataset('Global Superstore Orders 2016.csv')
subsample_size = 999000000000  # Upper bound on the number of training rows to keep.
# Sampling without replacement cannot return more rows than the table holds,
# so cap the request at the table's length.
train_data = train_data.sample(n=min(subsample_size, len(train_data)), random_state=0)
train_data.head()
# NOTE(review): assumes the CSV has a column literally named 'class' --
# confirm the real target column of this dataset.
label = 'class'
print("Summary of class variable: \n", train_data[label].describe())

#Use AutoGluon to train multiple models.
save_path = 'agModels-predictClass'  # Specifies folder to store trained models.
predictor = TabularPredictor(label=label, path=save_path).fit(train_data)
# NOTE(review): the "test" data is loaded from the same CSV file as the
# training data, so the scores below are not measured on held-out rows.
test_data = TabularDataset('Global Superstore Orders 2016.csv')
y_test = test_data[label]  # Values to predict.
test_data_nolab = test_data.drop(columns=[label])  # Delete label column to prove we're not cheating.
test_data_nolab.head()

#predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
predictor.leaderboard(test_data, silent=True)

# Second fit: this time the CSV path itself is passed as `train_data`.
from autogluon.tabular import TabularPredictor
predictor = TabularPredictor(label=label).fit(train_data='Global Superstore Orders 2016.csv')

#.fit() returns a predictor object.
pred_probs = predictor.predict_proba(test_data_nolab)
pred_probs.head(5)

#Summarize what happened during fit.
results = predictor.fit_summary(show_plot=True)
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)
predictor.leaderboard(test_data, silent=True)
predictor.predict(test_data, model='LightGBM')

#Maximize the predictive performance.
time_limit = 11
metric = 'roc_auc'  # Specify the evaluation metric here.

# Third fit: same (subsampled) training data, with the metric, time limit and
# 'best_quality' preset set above.
predictor = TabularPredictor(label, eval_metric=metric).fit(train_data, time_limit=time_limit, presets='best_quality')
predictor.leaderboard(test_data, silent=True)

In [None]:
#Tabular prediction with AutoGluon:
# Trains AutoGluon tabular predictors on pueblosMagicos.csv for the 'class'
# column, then trains a second predictor for the 'PUEBLO' column.

#Predict columns in a table.
from autogluon.tabular import TabularDataset, TabularPredictor
train_data = TabularDataset('pueblosMagicos.csv')
# Upper bound on the number of training rows; lower it for a faster demo.
subsample_size = 999000000000
# Sampling without replacement fails with ValueError when n is larger than
# the number of rows, so never request more rows than the table holds.
train_data = train_data.sample(n=min(subsample_size, len(train_data)), random_state=0)
train_data.head()
# NOTE(review): assumes the CSV has a column named 'class' -- confirm.
label = 'class'
print("Summary of class variable: \n", train_data[label].describe())

#Use AutoGluon to train multiple models.
save_path = 'agModels-predictClass'  # specifies folder to store trained models
predictor = TabularPredictor(label=label, path=save_path).fit(train_data)
# NOTE(review): the test set is loaded from the same CSV as the training set,
# so the scores below are not computed on held-out data.
test_data = TabularDataset('pueblosMagicos.csv')
y_test = test_data[label]  # values to predict
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

#predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
predictor.leaderboard(test_data, silent=True)

from autogluon.tabular import TabularPredictor
# Refit with default settings, this time passing the CSV path directly.
predictor = TabularPredictor(label=label).fit(train_data='pueblosMagicos.csv')

#.fit() returns a predictor object.
pred_probs = predictor.predict_proba(test_data_nolab)
pred_probs.head(5)

#Summarize what happened during fit.
results = predictor.fit_summary(show_plot=True)
print("AutoGluon infers problem type is: ", predictor.problem_type)
print("AutoGluon identified the following types of features:")
print(predictor.feature_metadata)
predictor.leaderboard(test_data, silent=True)
# Predict with one specific trained model instead of the default choice.
predictor.predict(test_data, model='LightGBM')

#Maximizing predictive performance.
time_limit = 11  # Passed to fit() as time_limit.
metric = 'roc_auc'  # Specify the evaluation metric here.
predictor = TabularPredictor(label, eval_metric=metric).fit(train_data, time_limit=time_limit, presets='best_quality')
predictor.leaderboard(test_data, silent=True)

#Regression (predict numeric table columns)
# NOTE(review): the heading says "numeric", but nothing here checks that the
# 'PUEBLO' column is numeric -- confirm against the CSV.
pueblo_column = 'PUEBLO'
print("Summary of PUEBLO variable: \n", train_data[pueblo_column].describe())

predictor_pueblo = TabularPredictor(label=pueblo_column, path="agModels-predictAge").fit(train_data, time_limit=60)
performance = predictor_pueblo.evaluate(test_data)

#See the per-model performance.
predictor_pueblo.leaderboard(test_data, silent=True)

In [None]:
#All NeurIPS (NIPS) Papers dataset
# Downloads the dataset's web page, collects the 5-cell table rows it
# contains, and then (inside the row loop) tries to build a DataFrame, train
# an H2O GBM and run a TPOT search on it.
# NOTE(review): this cell uses several names it never defines (pd,
# weather_data, sklearn, train, valid); presumably they are expected from
# earlier notebook state -- verify before running.
import requests
from bs4 import BeautifulSoup
# Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
#Specify the URL.
url = "https://www.kaggle.com/rowhitswami/nips-papers-1987-2019-updated"
# Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of every element in a row, with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
# Find all table rows and data cells within.
stats = [] 
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td')) 
  # Keep only rows that have exactly 5 data cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  # NOTE(review): everything below is indented inside the row loop, so it
  # runs once per child of every <tr>, not once after scraping -- confirm
  # that this is intended.
  new_cols = []  # never read again in this cell
  for each_new_col in row:
    # each_new_col is a child node of the current row; it is passed as the
    # DataFrame's column labels.
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    # NOTE(review): weather_data is not defined in this cell.
    weather_data[each_new_col] = stats_data[each_new_col].map(int)
    X, y = weather_data
    # NOTE(review): `sklearn` is not imported as a module in this cell.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1313)
    model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=100,
                                    max_depth=4,
                                    learn_rate=0.1)
    
    #Specify the predictor set and response.
    x = list(weather_data.columns)
    x
    #Train the model.
    # NOTE(review): `train` and `valid` are not defined in this cell.
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(model)
    
    #TPOT-NN
    from tpot import TPOTClassifier
    from sklearn.model_selection import train_test_split
    # Second split of the same X, y: 75% train / 25% test, no fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

    clf = TPOTClassifier(config_dict='TPOT NN', template='Selector-Transformer-PytorchLRClassifier',
                        verbosity=2, population_size=10, generations=10)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    # Write the pipeline found by TPOT to a Python file.
    clf.export('tpot_nn_weather_data_pipeline.py')

AutoKeras

In [None]:
#Install AutoKeras
!pip install autokeras

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist
import autokeras as ak
# Search space: a normalized image input feeds two parallel branches
# (a ConvBlock and a ResNet v2 block) that are merged into one
# classification head.
input_node = ak.ImageInput()
output_node = ak.Normalization()(input_node)
output_node1 = ak.ConvBlock()(output_node)
output_node2 = ak.ResNetBlock(version="v2")(output_node)
output_node = ak.Merge()([output_node1, output_node2])
output_node = ak.ClassificationHead()(output_node)

auto_model = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=100
)

#Prepare data to run the model.
# mnist.load_data() returns two (images, labels) pairs: train, then test.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print(x_train.shape)
print(y_train.shape)
print(y_train[:3])

# Feed the AutoModel with training data (only the first 100 samples).
auto_model.fit(x_train[:100], y_train[:100], epochs=1000)
# Predict with the best model.
predicted_y = auto_model.predict(x_test)
# Evaluate the best model with testing data.
print(auto_model.evaluate(x_test, y_test))

#Implement new block.
class SingleDenseLayerBlock(ak.Block):
    """AutoKeras block made of one Dense layer with a tunable unit count."""

    def build(self, hp, inputs=None):
        """Apply a Dense layer to the first input node and return the result.

        The layer width is the hyperparameter "num_units", searched from
        32 to 512 in steps of 32.
        """
        # Only the first node of the (possibly nested) inputs is used.
        first_input = tf.nest.flatten(inputs)[0]
        unit_count = hp.Int("num_units", min_value=32, max_value=512, step=32)
        dense = tf.keras.layers.Dense(unit_count)
        return dense(first_input)

# Wire the custom dense block into a regression AutoModel.
input_node = ak.Input()
output_node = ak.RegressionHead()(SingleDenseLayerBlock()(input_node))
auto_model = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=100
)

# Random float32 data: 20 feature columns and 1 target column, generated in
# the order x_train, y_train, x_test, y_test.
num_instances = 100
x_train, y_train, x_test, y_test = (
    np.random.rand(num_instances, width).astype(np.float32)
    for width in (20, 1, 20, 1)
)

# Run the search, then report the evaluation on the test arrays.
auto_model.fit(x_train, y_train, epochs=1000)
print(auto_model.evaluate(x_test, y_test))

In [None]:
import numpy as np 
import pandas as pd
from autokeras import StructuredDataClassifier
from sklearn.model_selection import train_test_split
import requests
from bs4 import BeautifulSoup
import tensorflow as tf
import autokeras as ak

# Search space: a normalized image input feeds two parallel branches
# (a ConvBlock and a ResNet v2 block) that are merged into one
# classification head.
input_node = ak.ImageInput()
output_node = ak.Normalization()(input_node)
output_node1 = ak.ConvBlock()(output_node)
output_node2 = ak.ResNetBlock(version="v2")(output_node)
output_node = ak.Merge()([output_node1, output_node2])
output_node = ak.ClassificationHead()(output_node)

#Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
#Specify the URL.
url = "https://www.kaggle.com/kannan1314/apple-stock-price-all-time?select=Apple.csv"

#Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
# Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of every element in a row, with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
#Find all table rows and data cells within.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
  stat = extract_contents(row.find_all('td'))
  # Keep only rows that have exactly 5 data cells.
  if len(stat) == 5:
    stats.append(stat)
  #Now convert the data into a pandas dataframe for further processing.
  # NOTE(review): everything below is indented inside the row loop, so it
  # runs once per child of every <tr>, not once after scraping -- confirm
  # that this is intended.
  new_cols = []
  for each_new_col in row:
    stats_data = pd.DataFrame(data = stats, columns = each_new_col)
    stats_data.head()
    #Scraped data columns are of 'string' datatype so convert them into 'int' datatype.
    # NOTE(review): kaggle_data is not defined in this cell.
    kaggle_data[each_new_col] = stats_data[each_new_col].map(int)
    X, y = stats_data
    # Use the train_test_split name imported at the top of this cell; the
    # bare `sklearn` module is never imported here.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)
    model = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=100,
                                    max_depth=4,
                                    learn_rate=0.1)
    #Specify the predictor set and response.
    # NOTE(review): stats_train is not defined in this cell (stats_data may
    # have been meant) -- confirm.
    x = list(stats_train.columns)
    x
    #Train the model.
    # NOTE(review): `train` and `valid` are not defined in this cell.
    model.train(x=x, y=y, training_frame=train, validation_frame=valid)
    print(model)

    auto_model = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=100)

    #Prepare data to run the model.
    # Unpack two (features, labels) pairs: train, then test.
    # NOTE(review): assumes kaggle_data has that nested two-pair shape.
    (x_train, y_train), (x_test, y_test) = kaggle_data
    print(x_train.shape)
    print(y_train.shape)
    print(y_train[:3])

    #Feed the AutoModel with training data (only the first 100 samples).
    auto_model.fit(x_train[:100], y_train[:100], epochs=1000)
    # Predict with the best model.
    predicted_y = auto_model.predict(x_test)
    # Evaluate the best model with testing data.
    print(auto_model.evaluate(x_test, y_test))

    #Implement new block.
    class SingleDenseLayerBlock(ak.Block):
        """AutoKeras block made of one Dense layer with a tunable unit count."""

        def build(self, hp, inputs=None):
            # Get the input_node from inputs.
            input_node = tf.nest.flatten(inputs)[0]
            layer = tf.keras.layers.Dense(
                hp.Int("num_units", min_value=32, max_value=512, step=32)
            )
            output_node = layer(input_node)
            return output_node

    # Build the AutoModel.
    # This rebinds input_node / output_node, replacing the image graph that
    # was built at the top of the cell.
    input_node = ak.Input()
    output_node = SingleDenseLayerBlock()(input_node)
    output_node = ak.RegressionHead()(output_node)
    auto_model = ak.AutoModel(input_node, output_node, overwrite=True, max_trials=100)

    # Prepare the data: random float32 features (20 columns) and targets.
    num_instances = 100
    x_train = np.random.rand(num_instances, 20).astype(np.float32)
    y_train = np.random.rand(num_instances, 1).astype(np.float32)
    x_test = np.random.rand(num_instances, 20).astype(np.float32)
    y_test = np.random.rand(num_instances, 1).astype(np.float32)

    # Train the model.
    auto_model.fit(x_train, y_train, epochs=1000)
    print(auto_model.evaluate(x_test, y_test))

In [None]:
import argparse
import os

import autokeras as ak
import tensorflow_cloud as tfc
from tensorflow.keras.datasets import mnist


# Command line: the single required option is the model save directory.
parser = argparse.ArgumentParser(description="Model save path arguments.")
parser.add_argument(
    "--path",
    required=True,
    type=str,
    help="Keras model save path",
)
args = parser.parse_args()

# Call tfc.run with the V100_1X chief machine config and the AutoKeras
# docker base image.
tfc.run(
    chief_config=tfc.COMMON_MACHINE_CONFIGS["V100_1X"],
    docker_base_image="haifengjin/autokeras:1.0.3",
)

# Load MNIST and print the training shapes and the first three labels.
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split
for preview in (x_train.shape, y_train.shape, y_train[:3]):
    print(preview)

# Search with at most two trials, 10 epochs each fit.
clf = ak.ImageClassifier(max_trials=2)
clf.fit(x_train, y_train, epochs=10)

# The accuracy is read from index 1 of the evaluate() result.
test_metrics = clf.evaluate(x_test, y_test)
print("Accuracy: {accuracy}".format(accuracy=test_metrics[1]))

# Export the best model and save it as model.h5 under the --path directory.
best_model = clf.export_model()
best_model.save(os.path.join(args.path, "model.h5"))

In [None]:
import numpy as np 
import pandas as pd
from autokeras import StructuredDataClassifier
from sklearn.model_selection import train_test_split
import requests
from bs4 import BeautifulSoup
import tensorflow as tf
import autokeras as ak

# Search space: a normalized image input feeds two parallel branches
# (a ConvBlock and a ResNet v2 block) that are merged into one
# classification head.
input_node = ak.ImageInput()
output_node = ak.Normalization()(input_node)
output_node1 = ak.ConvBlock()(output_node)
output_node2 = ak.ResNetBlock(version="v2")(output_node)
output_node = ak.Merge()([output_node1, output_node2])
output_node = ak.ClassificationHead()(output_node)

#Import H2O GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
#Specify the URL.
url = "https://www.kaggle.com/c/petfinder-adoption-prediction/data"
#Make a GET request to fetch the raw HTML content.
web_content = requests.get(url).content
#Parse the html content.
soup = BeautifulSoup(web_content, "html.parser")
# Collect the text of every element in a row, with newline characters removed.
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
#Find all table rows and data cells within.
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    # Keep only rows that have exactly 5 data cells.
    if len(stat) == 5:
        stats.append(stat)

#Now convert the data into a pandas dataframe for further processing.
new_cols = []
# NOTE(review): `row` is whatever the loop above left behind (and is unbound
# when the page has no <tr>); its child nodes are used as column labels --
# confirm that this is the intended source of column names.
for each_new_col in row:
    # Build the frame from the scraped rows, as the sibling scraping cells do.
    petfinder_data = pd.DataFrame(data = stats, columns = each_new_col)
    petfinder_data.head()

# NOTE(review): this unpacking expects two (features, labels) pairs, which a
# DataFrame does not provide -- confirm how the train/test split should be made.
(x_train, y_train), (x_test, y_test) = petfinder_data
# Keep only the first 100 training samples.
x_train = x_train[:100]
y_train = y_train[:100]
print(x_train.shape)
print(y_train.shape)
print(y_train[:3])

# Four successive image-regression runs on x_train / y_train: a default
# ImageRegressor, the same regressor refit with a validation split, and two
# AutoModels with hand-built search spaces. Each run rebinds `reg`.

# Initialize the image regressor (a single trial).
reg = ak.ImageRegressor(overwrite=True, max_trials=1)

# Feed the image regressor with training data.
reg.fit(x_train, y_train, epochs=2)

# Predict with the best model.
predicted_y = reg.predict(x_test)
print(predicted_y)

# Evaluate the best model with testing data.
print(reg.evaluate(x_test, y_test))

#Validation data
# Refit the same regressor, now with a validation split and 2000 epochs.
reg.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
    epochs=2000,
)

#Customized search space
input_node = ak.ImageInput()
output_node = ak.ImageBlock(
    # Only search ResNet architectures.
    block_type="resnet",
    # Do not normalize the dataset.
    normalize=False,
    # Do not do data augmentation.
    augment=False,
)(input_node)
output_node = ak.RegressionHead()(output_node)
reg = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=100
)

reg.fit(x_train, y_train, epochs=2000)

# Second hand-built graph: normalization, augmentation with horizontal flip
# disabled, a ResNet v2 block, then a regression head.
input_node = ak.ImageInput()
output_node = ak.Normalization()(input_node)
output_node = ak.ImageAugmentation(horizontal_flip=False)(output_node)
output_node = ak.ResNetBlock(version="v2")(output_node)
output_node = ak.RegressionHead()(output_node)

reg = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=100
)

reg.fit(x_train, y_train, epochs=2000)

#Data format
# Trains an ImageRegressor from tf.data.Dataset objects built from MNIST.
# NOTE(review): `mnist` is not imported in this cell; presumably it comes
# from an earlier cell's `from tensorflow.keras.datasets import mnist` -- verify.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Append a trailing axis of size 1 to the images (channel dimension) and to
# the labels.
x_train = x_train.reshape(x_train.shape + (1,))
x_test = x_test.reshape(x_test.shape + (1,))
y_train = y_train.reshape(y_train.shape + (1,))
y_test = y_test.reshape(y_test.shape + (1,))

print(x_train.shape) 
print(y_train.shape)  

# Each dataset element is a pair of 1-tuples: ((image,), (label,)).
train_set = tf.data.Dataset.from_tensor_slices(((x_train,), (y_train,)))
test_set = tf.data.Dataset.from_tensor_slices(((x_test,), (y_test,)))

reg = ak.ImageRegressor(overwrite=True, max_trials=100)

# Feed the tensorflow Dataset to the regressor.
reg.fit(train_set, epochs=2000)

# Predict with the best model.
predicted_y = reg.predict(test_set)

# Evaluate the best model with testing data.
print(reg.evaluate(test_set))

**Deliverables:**

*   Python package with an Automated ML function to be called using any data frame (dataset) to give a good trained model.
*   Details on the steps automated and the scenario in which they execute.
*   Any scenario which is not automated.

TPOT cannot currently solve multi-label regression problems.