# Python files that contains 'sklearn'

In [2]:
from bigquery_python_framework.GithubPython import GithubPython

In [6]:
sklearnFile = GithubPython().uniqueFiles().contains('sklearn').excludeByRepoName('sklearn').getCount().run()[0][0]
print("There are {} files containing 'sklearn'".format(sklearnFile))

There are 43629 files containing 'sklearn'


In [7]:
totalFile = GithubPython().uniqueFiles().getCount().run()[0][0]

In [8]:
print('There are {} files in total'.format(totalFile))

There are 5995653 files in total


In [13]:
print('Percentage: %f%%'%(sklearnFile/totalFile*100))

Percentage: 0.727677%


# Modules that import most 'sklearn'

In [2]:
moduleImportMostSklearn = GithubPython().module_with_most_import()

In [5]:
moduleImportMostSklearn[:20]

[('seckcoder/lang-learn', 121),
 ('magic2du/contact_matrix', 96),
 ('jpzk/evopy', 87),
 ('GbalsaC/bitnamiP', 71),
 ('loli/sklearn-ensembletrees', 61),
 ('chaluemwut/fbserver', 57),
 ('zooniverse/aggregation', 51),
 ('B3AU/waveTree', 49),
 ('valexandersaulys/airbnb_kaggle_contest', 47),
 ('kedz/cuttsum', 47),
 ('Tjorriemorrie/trading', 47),
 ('southpaw94/MachineLearning', 47),
 ('akhilpm/Masters-Project', 46),
 ('salma1601/nilearn', 45),
 ('diogo149/CauseEffectPairsPaper', 42),
 ('NicovincX2/Python-3.5', 38),
 ('chemelnucfin/tensorflow', 37),
 ('abenicho/isvr', 37),
 ('ainafp/nilearn', 37),
 ('weissercn/MLTools', 37)]

# Context

In [39]:
def printContext(fileName = 'RFC.csv', num = 10):
    import csv
    with open('context/{}'.format(fileName),'r') as f:
        spamreader = csv.reader(f, delimiter=',', quotechar='"')
        i = 0
        for row in spamreader:
            if i < 2:
                i += 1
                continue
            if i >= num + 2:
                break
            print(row[0])
            print('\n------------------------------Separator------------------------------\n')
            i += 1

In [40]:
printContext()

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# classification models
classifiers = {'K-Nearest Neighbors (Braycurtis norm)':
               KNeighborsClassifier(n_neighbors=3, algorithm='auto',
                                    metric='braycurtis'),
               'Random Forest':
               RandomForestClassifier(n_estimators=80, n_jobs=1),
               'SVM': SVC(gamma=2, C=1),
               'Linear Support Vector Machine': SVC(kernel="linear", C=0.025),
               'Decision Tree': DecisionTreeClassifier(max_depth=5),
               'Ada Boost': AdaBoostClassifier(n_estimators=80,
                                               learning_rate=0.4),
               'Naive Bayes': GaussianNB(),
               }
vc = VotingClassifier(estimators=list(classifiers.items()), voting='hard')


------------------------------Separator------------------------------

from sklearn.ensemble import AdaBoost

# BaseEstimator + @deprecated

In [10]:
queryString = """\
SELECT
  sample_repo_name,
  sample_path,
FROM 
  [scikit-learn-research:pyfiles.content_py] 
WHERE
  content CONTAINS 'BaseEstimator'
  AND content CONTAINS '@deprecated'
  AND (NOT RIGHT(sample_repo_name,12) = "scikit-learn")
  AND (NOT RIGHT(sample_repo_name,7) = "sklearn")
"""
baseEstimatorDeprecated = GithubPython()
result = baseEstimatorDeprecated.run(queryString)

In [15]:
def getGithubURL(result):
    for repo_name, path in result:
        print("https://github.com/{}/tree/master/{}".format(repo_name,path))

In [16]:
getGithubURL(result)

https://github.com/chaluemwut/fbserver/tree/master/venv/lib/python2.7/site-packages/sklearn/feature_extraction/text.py
https://github.com/DailyActie/Surrogate-Model/tree/master/projects/scikit-learn-master/sklearn/preprocessing/data.py
https://github.com/juharris/tensorflow/tree/master/tensorflow/contrib/learn/python/learn/estimators/random_forest.py
https://github.com/naturali/tensorflow/tree/master/tensorflow/contrib/learn/python/learn/monitors.py
https://github.com/sandeepdsouza93/TensorFlow-15712/tree/master/tensorflow/contrib/learn/python/learn/estimators/dnn.py
https://github.com/nanditav/15712-TensorFlow/tree/master/tensorflow/contrib/learn/python/learn/monitors.py
https://github.com/alisidd/tensorflow/tree/master/tensorflow/contrib/learn/python/learn/monitors.py
https://github.com/mengxn/tensorflow/tree/master/tensorflow/contrib/learn/python/learn/estimators/estimator.py
https://github.com/nanditav/15712-TensorFlow/tree/master/tensorflow/contrib/learn/python/learn/estimators/li

## Context of @deprecated in above files

In [33]:
def getContext(modelName):
  from google.cloud import bigquery
  client = bigquery.Client()
  query = '''\
  #standardSQL
  CREATE TEMPORARY FUNCTION parsePythonFile(a STRING)
  RETURNS STRING
  LANGUAGE js AS """
    if (a === null) {
      return null;
    }
    var lines = a.split('\\\\n');
    for (i=0;i<lines.length;i++) {
      if (lines[i].indexOf("%s")!==-1){
        return lines.slice(Math.max(i-10,0),Math.min(i+10,lines.length-1)).join("\\\\n");
      }
    }
  """;

  CREATE TEMPORARY FUNCTION parsePythonFile2(a STRING, b STRING)
  RETURNS STRING
  LANGUAGE js AS """
    if (a === null) {
      return null;
    }
    var lines = a.split('\\\\n');
    for (i=0;i<lines.length;i++) {
      if (lines[i].indexOf("%s")!==-1){
        return b;
      }
    }
  """;

  SELECT
    parsePythonFile(content) match,
    parsePythonFile2(content,sample_path) path,
    parsePythonFile2(content,sample_repo_name ) repo_name,
    count(*) count
  FROM   
    `scikit-learn-research.pyfiles.content_py` 
  WHERE
    (NOT ENDS_WITH(sample_repo_name, "scikit-learn"))
     AND (NOT ENDS_WITH(sample_repo_name, "sklearn"))
     AND NOT STRPOS(content,'BaseEstimator') = 0
  GROUP BY
  1,2,3
  ORDER BY 
  count DESC
  '''% (modelName,modelName)
  result = client.run_sync_query(query)
  result.timeout_ms = 99999999
  result.run()
  return result

In [34]:
resultForContext = getContext('@deprecated')

In [39]:
resultForContext = resultForContext.rows

In [43]:
for val, _, _, _ in resultForContext[:20]:
    print(val)
    print('\n------------------------------Separator------------------------------\n')

None

------------------------------Separator------------------------------

    if self._n_classes == 2:
      metrics.update({
          "auc": metric_spec.MetricSpec(
              metric_fn=metrics_lib.streaming_auc,
              prediction_key=_LOGISTIC,
              weight_key=self._weight_column_name)})
    return self._estimator.evaluate(
        x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size,
        steps=steps, metrics=metrics, name=name)

  @deprecated_arg_values(
      estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS,
      as_iterable=False)
  def predict(self, x=None, input_fn=None, batch_size=None, as_iterable=True):
    """Returns predicted classes for given features.

    Args:
      x: features.
      input_fn: Input function. If set, x must be None.
      batch_size: Override default batch size.

------------------------------Separator------------------------------

    """

    def __init__(self, y_min=None, y_max=None, increasi

## How about DeprecationWarning + BaseEstimator?

In [17]:
queryStringForDepreciationWarning = """\
SELECT
  sample_repo_name,
  sample_path,
FROM 
  [scikit-learn-research:pyfiles.content_py] 
WHERE
  content CONTAINS 'BaseEstimator'
  AND content CONTAINS 'DepreciationWarning'
  AND (NOT RIGHT(sample_repo_name,12) = "scikit-learn")
  AND (NOT RIGHT(sample_repo_name,7) = "sklearn")
"""
baseEstimatorDeprecated = GithubPython()
result = baseEstimatorDeprecated.run(queryStringForDepreciationWarning)
getGithubURL(result) #0 result

## Only DepreciationWarning?

In [21]:
queryStringForDepreciationWarningOnly = """\
SELECT
  sample_repo_name,
  sample_path,
FROM 
  [scikit-learn-research:pyfiles.content_py] 
WHERE
  content CONTAINS 'DepreciationWarning'
  AND (NOT RIGHT(sample_repo_name,12) = "scikit-learn")
  AND (NOT RIGHT(sample_repo_name,7) = "sklearn")
"""
depreciationOnly = GithubPython().run(queryStringForDepreciationWarningOnly)
getGithubURL(depreciationOnly)

https://github.com/jaduimstra/nilmtk/tree/master/nilmtk/metrics.py
https://github.com/mmottahedi/nilmtk/tree/master/nilmtk/feature_detectors/steady_states.py
https://github.com/total-impact/total-impact-core/tree/master/totalimpact/providers/provider.py
https://github.com/jaduimstra/nilmtk/tree/master/nilmtk/disaggregate/combinatorial_optimisation.py
https://github.com/josemao/nilmtk/tree/master/nilmtk/metrics.py
https://github.com/josemao/nilmtk/tree/master/nilmtk/disaggregate/combinatorial_optimisation.py
https://github.com/rvbelefonte/Rockfish2/tree/master/rockfish2/navigation/ukooa/p190/database.py
https://github.com/AlexRobson/nilmtk/tree/master/nilmtk/feature_detectors/cluster.py
https://github.com/Impactstory/total-impact-webapp/tree/master/totalimpact/providers/provider.py
https://github.com/nilmtk/nilmtk/tree/master/nilmtk/disaggregate/combinatorial_optimisation.py
