# Python files that contain 'sklearn'

In [1]:
from bigquery_python_framework.GithubPython import GithubPython

In [6]:
# Count the unique files that contain 'sklearn', with excludeByRepoName('sklearn')
# applied; run() is indexed [0][0] to pull the single count value out of the result.
sklearnFile = (
    GithubPython()
    .uniqueFiles()
    .contains('sklearn')
    .excludeByRepoName('sklearn')
    .getCount()
    .run()[0][0]
)
print("There are {} files containing 'sklearn'".format(sklearnFile))

There are 43629 files containing 'sklearn'


In [7]:
# Same count as above but without the contains()/excludeByRepoName() filters;
# used as the denominator of the percentage computed further down.
totalFile = GithubPython().uniqueFiles().getCount().run()[0][0]

In [8]:
print('There are {} files in total'.format(totalFile))

There are 5995653 files in total


In [13]:
# Share of all counted files that contain 'sklearn', printed as a percentage
# (%f formats with six decimal places; %% emits a literal percent sign).
print('Percentage: %f%%'%(sklearnFile/totalFile*100))

Percentage: 0.727677%


# Modules that import 'sklearn' the most

In [2]:
# The result is sliceable; the next cell displays its first 20 entries, which
# are (name, count) tuples.
moduleImportMostSklearn = GithubPython().module_with_most_import()

In [5]:
moduleImportMostSklearn[:20]

[('seckcoder/lang-learn', 121),
 ('magic2du/contact_matrix', 96),
 ('jpzk/evopy', 87),
 ('GbalsaC/bitnamiP', 71),
 ('loli/sklearn-ensembletrees', 61),
 ('chaluemwut/fbserver', 57),
 ('zooniverse/aggregation', 51),
 ('B3AU/waveTree', 49),
 ('valexandersaulys/airbnb_kaggle_contest', 47),
 ('kedz/cuttsum', 47),
 ('Tjorriemorrie/trading', 47),
 ('southpaw94/MachineLearning', 47),
 ('akhilpm/Masters-Project', 46),
 ('salma1601/nilearn', 45),
 ('diogo149/CauseEffectPairsPaper', 42),
 ('NicovincX2/Python-3.5', 38),
 ('chemelnucfin/tensorflow', 37),
 ('abenicho/isvr', 37),
 ('ainafp/nilearn', 37),
 ('weissercn/MLTools', 37)]

# Context

In [39]:
def printContext(fileName='RFC.csv', num=10):
    """Print the first column of up to `num` rows from ``context/<fileName>``.

    The first two rows of the CSV file are always skipped. Every printed value
    is followed by a separator banner.

    Args:
        fileName: Name of the CSV file inside the ``context/`` directory.
        num: Maximum number of rows to print; values <= 0 print nothing.

    Raises:
        OSError: If the file cannot be opened.
    """
    import csv
    from itertools import islice

    separator = '\n------------------------------Separator------------------------------\n'
    # newline='' hands newline handling to the csv module, as its documentation
    # requires; otherwise line breaks embedded in quoted fields are rewritten
    # by universal-newline translation before the parser sees them.
    with open('context/{}'.format(fileName), 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        data_rows = islice(reader, 2, None)  # drop the two leading rows
        # Blank lines are parsed as empty lists, for which row[0] would raise
        # IndexError, so they are skipped and do not count towards `num`.
        non_empty_rows = (row for row in data_rows if row)
        for row in islice(non_empty_rows, max(num, 0)):
            print(row[0])
            print(separator)

In [40]:
printContext()

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# classification models
classifiers = {'K-Nearest Neighbors (Braycurtis norm)':
               KNeighborsClassifier(n_neighbors=3, algorithm='auto',
                                    metric='braycurtis'),
               'Random Forest':
               RandomForestClassifier(n_estimators=80, n_jobs=1),
               'SVM': SVC(gamma=2, C=1),
               'Linear Support Vector Machine': SVC(kernel="linear", C=0.025),
               'Decision Tree': DecisionTreeClassifier(max_depth=5),
               'Ada Boost': AdaBoostClassifier(n_estimators=80,
                                               learning_rate=0.4),
               'Naive Bayes': GaussianNB(),
               }
vc = VotingClassifier(estimators=list(classifiers.items()), voting='hard')


------------------------------Separator------------------------------

from sklearn.ensemble import AdaBoost

# BaseEstimator + @deprecated

In [62]:
# BigQuery query in legacy-SQL syntax (bracketed table name, CONTAINS, FIRST):
# one row per distinct sample_path (GROUP BY 2) whose content contains both
# 'BaseEstimator' and '@deprecated'. Rows are dropped when the repo name ends
# in "scikit-learn" or "sklearn", or when the path contains either string.
# FIRST() keeps a single repo name for each path.
queryString = """\
SELECT
  FIRST(sample_repo_name),
  sample_path,
FROM 
  [scikit-learn-research:pyfiles.content_py] 
WHERE
  content CONTAINS 'BaseEstimator'
  AND content CONTAINS '@deprecated'
  AND (NOT RIGHT(sample_repo_name,12) = "scikit-learn")
  AND (NOT RIGHT(sample_repo_name,7) = "sklearn")
  AND (NOT sample_path CONTAINS 'sklearn')
  AND (NOT sample_path CONTAINS 'scikit-learn')
GROUP BY 2
"""
# Despite its name, this variable holds the GithubPython object used to run
# the query; the rows themselves end up in `result`.
baseEstimatorDeprecated = GithubPython()
result = baseEstimatorDeprecated.run(queryString)

In [63]:
def getGithubURL(result):
    """Print one GitHub URL per (repo_name, path) pair in `result`.

    The path is percent-encoded (``/`` is left as is) so that file names
    containing spaces, ``#`` or ``?`` still yield a working link.

    Args:
        result: Iterable of two-item sequences ``(repo_name, path)``.
    """
    from urllib.parse import quote

    for repo_name, path in result:
        print("https://github.com/{}/tree/master/{}".format(repo_name, quote(path)))

In [67]:
# Keep only the first row for each distinct path "tail" (the last three
# components of the path), so files whose last three path components match
# are listed once.
# NOTE(review): presumably meant to collapse forked/vendored copies of the
# same file - confirm.
seen = set()  # set membership is O(1); a list made this loop quadratic
resultFiltered = []
for repo, path in result:
    pathTail = "/".join(path.split("/")[-3:])  # build the key once per row
    if pathTail not in seen:
        seen.add(pathTail)
        resultFiltered.append([repo, path])

In [68]:
getGithubURL(resultFiltered)

https://github.com/AsimmHirani/ISpyPi/tree/master/tensorflow/contrib/tensorflow-master/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
https://github.com/mengxn/tensorflow/tree/master/tensorflow/contrib/learn/python/learn/estimators/estimator.py
https://github.com/Denisolt/Tensorflow_Chat_Bot/tree/master/local/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/utils/export.py
https://github.com/liyi193328/seq2seq/tree/master/seq2seq/contrib/monitors.py
https://github.com/psarka/uplift/tree/master/uplift/base.py
https://github.com/Bismarrck/tensorflow/tree/master/tensorflow/contrib/learn/python/learn/estimators/linear.py
https://github.com/AsimmHirani/ISpyPi/tree/master/tensorflow/contrib/tensorflow-master/tensorflow/contrib/learn/python/learn/estimators/dnn.py
https://github.com/pierreg/tensorflow/tree/master/tensorflow/contrib/learn/python/learn/estimators/svm.py
https://github.com/liyi193328/seq2seq/tree/master/seq2seq/contrib/estimator.py
https

In [69]:
print("There are {} files with BaseEstimator + @deprecated".format(len(resultFiltered)))

There are 23 files with BaseEstimator + @deprecated


## Context of @deprecated in the above files

In [86]:
def getContext(modelName):
  """Query BigQuery for the lines surrounding the first use of `modelName`.

  Runs a standard-SQL query over `scikit-learn-research.pyfiles.content_py`.
  Only files whose content contains 'BaseEstimator' are considered; files are
  excluded when the repo name ends with "scikit-learn" or "sklearn", or when
  the path contains either string.

  For each remaining file a JavaScript UDF finds the first line containing
  `modelName` and returns a window from 10 lines before it through 9 lines
  after it, clamped to the file. A second UDF returns the path / repo name
  only for files that have such a line. Rows are grouped by
  (match, path, repo_name) and ordered by descending count. Files with no
  matching line produce NULL in all three columns and therefore collapse
  into a single row.

  Args:
    modelName: Text to look for. It is inserted verbatim into a JavaScript
      string literal through %-formatting, so it must not contain double
      quotes or backslashes.

  Returns:
    The object returned by `client.run_sync_query(query)` after `run()` has
    been called on it; callers read its `.rows` attribute.
  """
  from google.cloud import bigquery
  client = bigquery.Client()
  # Array.prototype.slice excludes its end index, so the window end is clamped
  # to lines.length. Clamping to lines.length-1 always dropped the last line of
  # the file, including the matching line itself when it was the last one.
  # The loop counters are declared with var instead of leaking as globals.
  query = '''\
  #standardSQL
  CREATE TEMPORARY FUNCTION parsePythonFile(a STRING)
  RETURNS STRING
  LANGUAGE js AS """
    if (a === null) {
      return null;
    }
    var lines = a.split('\\\\n');
    for (var i=0;i<lines.length;i++) {
      if (lines[i].indexOf("%s")!==-1){
        return lines.slice(Math.max(i-10,0),Math.min(i+10,lines.length)).join("\\\\n");
      }
    }
  """;

  CREATE TEMPORARY FUNCTION parsePythonFile2(a STRING, b STRING)
  RETURNS STRING
  LANGUAGE js AS """
    if (a === null) {
      return null;
    }
    var lines = a.split('\\\\n');
    for (var i=0;i<lines.length;i++) {
      if (lines[i].indexOf("%s")!==-1){
        return b;
      }
    }
  """;

  SELECT
    parsePythonFile(content) match,
    parsePythonFile2(content,sample_path) path,
    parsePythonFile2(content,sample_repo_name ) repo_name,
    count(*) count
  FROM
    `scikit-learn-research.pyfiles.content_py`
  WHERE
    (NOT ENDS_WITH(sample_repo_name, "scikit-learn"))
     AND (NOT ENDS_WITH(sample_repo_name, "sklearn"))
     AND NOT STRPOS(content,'BaseEstimator') = 0
     AND STRPOS(sample_path,'sklearn') = 0
     AND STRPOS(sample_path,'scikit-learn') = 0
  GROUP BY
  1,2,3
  ORDER BY
  count DESC
  '''% (modelName,modelName)
  result = client.run_sync_query(query)
  # Very large timeout in milliseconds (about 27.8 hours), set before run().
  result.timeout_ms = 99999999
  result.run()
  return result

In [87]:
resultForContext = getContext('@deprecated')

In [88]:
# Rebinds the name from the query object returned by getContext() to its rows
# (sliced in the next cell). Because the name is reused, re-running this cell
# alone would look up `.rows` on the rows themselves.
resultForContext = resultForContext.rows

In [89]:
# Print the first column of each of the first 20 four-column rows, followed by
# the separator banner.
for val, _, _, _ in resultForContext[:20]:
    print(val, '\n------------------------------Separator------------------------------\n', sep='\n')

None

------------------------------Separator------------------------------

    if self._n_classes == 2:
      metrics.update({
          "auc": metric_spec.MetricSpec(
              metric_fn=metrics_lib.streaming_auc,
              prediction_key=_LOGISTIC,
              weight_key=self._weight_column_name)})
    return self._estimator.evaluate(
        x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size,
        steps=steps, metrics=metrics, name=name)

  @deprecated_arg_values(
      estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS,
      as_iterable=False)
  def predict(self, x=None, input_fn=None, batch_size=None, as_iterable=False):
    """Returns predicted classes for given features.

    Args:
      x: features.
      input_fn: Input function. If set, x must be None.
      batch_size: Override default batch size.

------------------------------Separator------------------------------

          matched.append(key)
    return matched, non_matched



## How about DeprecationWarning + BaseEstimator?

In [78]:
# Same legacy-SQL query shape as the '@deprecated' query earlier in the
# notebook, but matching 'DeprecationWarning' together with 'BaseEstimator':
# one row per distinct sample_path (GROUP BY 2), with the same repo-name and
# path exclusions for "scikit-learn"/"sklearn".
queryStringForDeprecationWarning = """\
SELECT
  FIRST(sample_repo_name),
  sample_path,
FROM 
  [scikit-learn-research:pyfiles.content_py] 
WHERE
  content CONTAINS 'BaseEstimator'
  AND content CONTAINS 'DeprecationWarning'
  AND (NOT RIGHT(sample_repo_name,12) = "scikit-learn")
  AND (NOT RIGHT(sample_repo_name,7) = "sklearn")
  AND (NOT sample_path CONTAINS 'sklearn')
  AND (NOT sample_path CONTAINS 'scikit-learn')
GROUP BY 2
"""
# NOTE: both names below are rebound here, replacing the object and rows from
# the '@deprecated' query cell; `baseEstimatorDeprecated` holds the
# GithubPython object used to run the query, not the rows.
baseEstimatorDeprecated = GithubPython()
result = baseEstimatorDeprecated.run(queryStringForDeprecationWarning)

In [79]:
# Keep only the first row for each distinct path "tail" (the last three
# components of the path), so files whose last three path components match
# are listed once.
# NOTE(review): presumably meant to collapse forked/vendored copies of the
# same file - confirm.
seen = set()  # set membership is O(1); a list made this loop quadratic
resultFiltered = []
for repo, path in result:
    pathTail = "/".join(path.split("/")[-3:])  # build the key once per row
    if pathTail not in seen:
        seen.add(pathTail)
        resultFiltered.append([repo, path])

In [80]:
getGithubURL(resultFiltered)

https://github.com/annapasca/mne-python/tree/master/mne/fixes.py
https://github.com/ml-slac/deep-jets/tree/master/training/fisher.py
https://github.com/rafwiewiora/msmbuilder/tree/master/msmbuilder/tests/test_estimator_subclassing.py
https://github.com/msultan/osprey/tree/master/osprey/eval_scopes.py
https://github.com/gusseppe/pymach/tree/master/pymach/improve.py
https://github.com/cxhernandez/msmbuilder/tree/master/msmbuilder/cluster/__init__.py
https://github.com/tgsmith61591/skutil/tree/master/skutil/h2o/split.py
https://github.com/Sklearn-HMM/scikit-learn-HMM/tree/master/sklean-hmm/linear_model/stochastic_gradient.py
https://github.com/anonymous-ijcai/dsw-ont-ijcai/tree/master/dswont/relation_type.py
https://github.com/JosmanPS/parallel-SVM/tree/master/m_learning/base.py
https://github.com/dwettstein/pattern-recognition-2016/tree/master/mlp/gaussian_process/gaussian_process.py
https://github.com/likelyzhao/mxnet/tree/master/python/mxnet/model.py
https://github.com/slipguru/adenine

In [81]:
print("There are {} files with BaseEstimator + DeprecationWarning".format(len(resultFiltered)))

