# Install requirements

In [1]:
!pip install sklearn pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1310 sha256=5a01052621ba8644cce531a17b6db8262187c89b883ce993ad0c869d9f212913
  Stored in directory: /root/.cache/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


# Mount Google Drive (dataset location)

In [2]:
import pandas as pd
from google.colab import drive 
# Mount Google Drive so the per-project CSV files are reachable under
# /content/drive. The call is made for its side effect only; its result is
# deliberately not assigned to `drive`, which would shadow the imported module.
drive.mount('/content/drive')

Mounted at /content/drive


# Call **Random Forest** from `sklearn`

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.tree import export_graphviz
from IPython import display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Train and evaluate one random-forest classifier per project.
#
# For every project the class-level CSV is loaded, split into train/test
# partitions, a RandomForestClassifier is fitted on the training partition and
# accuracy plus macro-averaged F1 are printed for the held-out partition.

# Loop-invariant imports, hoisted so they are not re-executed per project.
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

LABEL_COL = 'Number of Bugs'         # target column predicted by the model
IGNORED_COLS = ['Hash', 'LongName']  # columns excluded before training
TEST_SIZE = 0.2                      # 80% train / 20% test
SPLIT_SEED = 23                      # fixed seed so the split is reproducible

projects = ['Android-Universal-Image-Loader','antlr4','BroadleafCommerce','ceylon-ide-eclipse','elasticsearch','hazelcast','junit','MapDB','mcMMO','mct','neo4j','netty','orientdb','oryx','titan']

for project in projects:
    csv_path = f"drive/MyDrive/subtract/{project}/class.csv"

    # Peek at the first data row to decide which columns to load: columns that
    # are empty in that row are dropped, as are the ignored columns.
    header = pd.read_csv(csv_path, nrows=1).dropna(axis='columns', how='all')
    cols = [col for col in header.columns if col not in IGNORED_COLS]

    df = pd.read_csv(csv_path, usecols=cols)

    # Hold out TEST_SIZE of the rows for evaluation.
    train, test = train_test_split(df, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    # Every loaded column except the label is a feature. Selecting by name
    # keeps features and ground truth correct regardless of column order.
    features = df.columns.drop(LABEL_COL)

    model = RandomForestClassifier(n_jobs=2, max_depth=10, random_state=0)

    # Encode the training labels as integer codes; `bug_labels` maps each code
    # back to the original label value.
    y_train, bug_labels = pd.factorize(train[LABEL_COL])

    model.fit(train[features], y_train)

    # Translate predicted codes back to the original label values so they are
    # comparable with the untouched test labels.
    preds = bug_labels.values[model.predict(test[features])]
    y_test = test[LABEL_COL]

    accuracy = f"{accuracy_score(y_test, preds) * 100:.2f}"

    # Keep the formatted value under its own name so the imported `f1_score`
    # function is not overwritten by a string.
    f1_text = f"{f1_score(y_test, preds, average='macro'):.2f}"

    print(f'Project: {project} => Accuracy: {accuracy} ; F1: {f1_text}')


Project: Android-Universal-Image-Loader => Accuracy: 31.25 ; F1: 0.18
Project: antlr4 => Accuracy: 47.62 ; F1: 0.27
Project: BroadleafCommerce => Accuracy: 55.07 ; F1: 0.22
Project: ceylon-ide-eclipse => Accuracy: 64.71 ; F1: 0.44
Project: elasticsearch => Accuracy: 49.61 ; F1: 0.12
Project: hazelcast => Accuracy: 47.57 ; F1: 0.18
Project: junit => Accuracy: 65.62 ; F1: 0.50
Project: MapDB => Accuracy: 37.22 ; F1: 0.16
Project: mcMMO => Accuracy: 47.62 ; F1: 0.21
Project: mct => Accuracy: 71.43 ; F1: 0.58
Project: neo4j => Accuracy: 44.80 ; F1: 0.18
Project: netty => Accuracy: 60.48 ; F1: 0.27
Project: orientdb => Accuracy: 47.16 ; F1: 0.17
Project: oryx => Accuracy: 85.00 ; F1: 0.46
Project: titan => Accuracy: 52.33 ; F1: 0.55
