In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# define RandomForest416
import scipy.stats 

class RandomForest416:
    """
    This class implements the common sklearn model interface (has a fit and predict function).

    A random forest is a collection of decision trees that are trained on random subsets of the
    dataset. When predicting the value for an example, takes a majority vote from the trees.
    """

    def __init__(self, num_trees, max_depth=None, random_state=None):
        """
        Constructs a RandomForest416 that uses the given number of trees, each with a
        max depth of max_depth.

        Args:
            num_trees: number of decision trees in the forest; must be at least 1.
            max_depth: forwarded unchanged to every DecisionTreeClassifier.
            random_state: optional integer seed. When given, the bootstrap samples drawn in
                fit and the seed handed to each tree are derived from it, so repeated runs
                give the same forest. When None (the default), NumPy's global random
                generator is used, exactly as before.

        Raises:
            ValueError: if num_trees is smaller than 1 (a vote needs at least one tree).
        """
        if num_trees < 1:
            raise ValueError("num_trees must be at least 1, got {}".format(num_trees))

        # Imported here so the class does not depend on another notebook cell having
        # imported DecisionTreeClassifier before the forest is constructed.
        from sklearn.tree import DecisionTreeClassifier

        if random_state is None:
            # The np.random module exposes the same randint() call as a RandomState.
            self._rng = np.random
            tree_seeds = [None] * num_trees
        else:
            self._rng = np.random.RandomState(random_state)
            tree_seeds = [
                int(seed) for seed in self._rng.randint(0, 2**31 - 1, size=num_trees)
            ]

        # the self._trees object is a list of models; fit and predict loop over it
        self._trees = [
            DecisionTreeClassifier(max_depth=max_depth, random_state=seed)
            for seed in tree_seeds
        ]

    @staticmethod
    def _take_rows(data, rows):
        """Returns the given row positions of a pandas object or of any array-like."""
        if hasattr(data, "iloc"):
            return data.iloc[rows]
        return np.asarray(data)[rows]

    def fit(self, X, y):
        """
        Takes an input dataset X and a series of targets y and trains the RandomForest416.

        Each tree will be trained on a random sample of the data that samples the examples
        uniformly at random (with replacement). Each random dataset will have the same number
        of examples as the original dataset, but some examples may be missing or appear more
        than once due to the random sampling with replacement.

        X and y may be pandas objects (rows are selected by position) or array-likes.

        Returns:
            self, so that calls can be chained.

        Raises:
            ValueError: if X is empty or X and y have different lengths.
        """
        num_examples = len(X)
        if num_examples == 0:
            raise ValueError("cannot fit a RandomForest416 on an empty dataset")
        if len(y) != num_examples:
            raise ValueError(
                "X and y must have the same length, got {} and {}".format(num_examples, len(y))
            )

        for tree in self._trees:
            # One set of row positions is used for both X and y so that every sampled
            # example keeps its own target.
            rows = self._rng.randint(0, num_examples, size=num_examples)
            tree.fit(self._take_rows(X, rows), self._take_rows(y, rows))
        return self

    def predict(self, X):
        """
        Takes an input dataset X and returns the predictions for each example in X.

        Returns:
            A 1-d array with one label per example, in the dtype the trees predict.
            Each entry is the label predicted by the most trees; ties go to the
            smallest label.
        """
        # Builds up a 2d array with n rows and T columns
        # where n is the number of points to classify and T is the number of trees
        per_tree = np.column_stack(
            [np.asarray(tree.predict(X)) for tree in self._trees]
        )
        num_examples = per_tree.shape[0]
        if num_examples == 0:
            # Nothing to vote on; return an empty 1-d array of the label dtype.
            return per_tree[:, 0]

        # Replace every label by its position in the sorted array of distinct labels so
        # that votes can be counted with integer indexing, whatever the label dtype is.
        classes, codes = np.unique(per_tree, return_inverse=True)
        codes = codes.reshape(per_tree.shape)

        votes = np.zeros((num_examples, len(classes)), dtype=int)
        row_ids = np.arange(num_examples)
        for tree_codes in codes.T:
            # Each row index occurs once per tree, so a plain indexed += is safe here.
            votes[row_ids, tree_codes] += 1

        # argmax returns the first maximum, i.e. the smallest label among tied labels.
        return classes[votes.argmax(axis=1)]

In [None]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Seed NumPy's global generator so that the bootstrap samples drawn through np.random
# (and therefore the submission) are the same on every "Restart & Run All".
SEED = 416
np.random.seed(SEED)

edx_train = pd.read_csv('/kaggle/input/cse-stat-416-sum-20/edx_train.csv')
df_test = pd.read_csv('/kaggle/input/cse-stat-416-sum-20/edx_test.csv')
# Keep the user ids before they are dropped below: the submission file is keyed on them.
to_save = df_test[['userid_DI']].copy()
print(len(df_test))

# pre-process the data: drop the same columns from both splits, then one-hot encode
dropped_columns = ['course_id', 'userid_DI', 'registered', 'start_time_DI', 'last_event_DI']
edx_train = pd.get_dummies(edx_train.drop(columns=dropped_columns))
df_test = pd.get_dummies(df_test.drop(columns=dropped_columns))

# Every training column except the target is a feature.
features = list(edx_train.columns)
features.remove('certified')

rf = RandomForest416(5, max_depth=10)
rf.fit(edx_train[features], edx_train['certified'])

# The code to make the predictions on the test data
# One-hot encoding each split separately can leave the test frame without some of the
# training columns; reindex puts the test columns in training order and fills absent
# ones with 0 instead of raising a KeyError.
# NOTE(review): assumes any column missing from the test split is a one-hot level that
# simply does not occur there - confirm against the raw CSVs.
predictions = rf.predict(df_test.reindex(columns=features, fill_value=0))

# ravel() accepts both a flat array and an (n, 1) column of predictions.
new_list = [int(label) for label in np.asarray(predictions).ravel()]

to_save['certified'] = new_list
to_save.to_csv('submission.csv', index=False)
print(to_save)
