In [3]:
from google.cloud import bigquery

# Create the BigQuery client. No project or credentials are passed here, so
# the library's defaults apply.
client = bigquery.Client()

# Reference the "github_repos" dataset in the "bigquery-public-data" project.
# The reference is built with the DatasetReference constructor because
# Client.dataset() is deprecated in google-cloud-bigquery (since 1.24.0);
# the resulting reference points at the same dataset.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "github_repos")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Construct a reference to the "sample_commits" table inside that dataset
table_ref = dataset_ref.table("sample_commits")

# API request - fetch the table
sample_commits_table = client.get_table(table_ref)

# Preview the first five rows of the table. This is the cell's last
# expression, so the notebook renders the resulting DataFrame.
client.list_rows(sample_commits_table, max_results=5).to_dataframe()



Unnamed: 0,commit,tree,parent,author,committer,subject,message,trailer,difference,difference_truncated,repo_name,encoding
0,afdba32e2a9ea729a9f9f280dbf6c718773c7ded,d77cca8a096e5320f3194d4a6ca1b4fef2dc9b99,[d65e55d4999b394e37ffe12543ecd2a17b7c44fc],"{'name': 'Jason Gunthorpe', 'email': 'a99b91d7...","{'name': 'Peter Huewe', 'email': '014f16385c5a...",tpm: Pull everything related to /dev/tpmX into...,tpm: Pull everything related to /dev/tpmX into...,"[{'key': 'Signed-off-by', 'value': 'Jason Gunt...","[{'old_mode': 33188, 'new_mode': 33188, 'old_p...",,torvalds/linux,
1,eb846d9f147455e4e5e1863bfb5e31974bb69b7c,443efbb146c7824508be817923bab04c2185810e,[3af6b35261182ff185db1f0fd271254147e2663e],"{'name': 'Hannes Reinecke', 'email': 'b0d1e9e4...","{'name': 'Christoph Hellwig', 'email': '923f77...",scsi: rename SERVICE_ACTION_IN to SERVICE_ACTI...,scsi: rename SERVICE_ACTION_IN to SERVICE_ACTI...,"[{'key': 'Signed-off-by', 'value': 'Hannes Rei...","[{'old_mode': 33188, 'new_mode': 33188, 'old_p...",,torvalds/linux,
2,f8798ccbefc0e4ef7438c080b7ba0410738c8cfa,9133440693c02314f1f6f95629b3594ce24ad0f8,[261e767628bb5971b9032439818237cc8511ea94],"{'name': 'Yong Zhang', 'email': '34add0fe16a1f...","{'name': 'Florian Tobias Schandinat', 'email':...",video: irq: Remove IRQF_DISABLED,video: irq: Remove IRQF_DISABLED\n\nSince comm...,"[{'key': 'Signed-off-by', 'value': 'Yong Zhang...","[{'old_mode': 33188, 'new_mode': 33188, 'old_p...",,torvalds/linux,
3,b83ae6d421435c6204150300f1c25bfbd39cd62b,99c6b661ab7de05c2fd49aa62624d2d6bf8abc69,[de1414a654e66b81b5348dbc5259ecf2fb61655e],"{'name': 'Christoph Hellwig', 'email': '923f77...","{'name': 'Jens Axboe', 'email': 'cd8c6775e60d6...",fs: remove mapping->backing_dev_info,fs: remove mapping->backing_dev_info\n\nNow th...,"[{'key': 'Signed-off-by', 'value': 'Christoph ...","[{'old_mode': 33188, 'new_mode': 33188, 'old_p...",,torvalds/linux,
4,aaabee8b7686dfe49f10289cb4b7a817b99e5dd9,7ccc6cf829a93d46daf484164a5466c91eca2efa,"[795e9364215dc98b1dea888ebae22383ecbbb92a, 2f2...","{'name': 'Luciano Coelho', 'email': 'd1ef58086...","{'name': 'Luciano Coelho', 'email': 'd1ef58086...",Merge branch 'wl12xx-next' into for-linville,Merge branch 'wl12xx-next' into for-linville\n...,"[{'key': 'Conflicts', 'value': '', 'email': No...","[{'old_mode': 33188, 'new_mode': 33188, 'old_p...",,torvalds/linux,


In [6]:
# Count commits per committer name for commits whose committer date falls in
# 2016 (half-open range: 2016-01-01 inclusive, 2017-01-01 exclusive), ordered
# with the most prolific committers first.
max_commits_query = """
                  SELECT committer.name as committer_name, COUNT(*) as num_commits
                  FROM `bigquery-public-data.github_repos.sample_commits`
                  WHERE committer.date >= '2016-01-01' AND committer.date < '2017-01-01'
                    GROUP BY committer_name
                    ORDER BY num_commits DESC
                  """
# Job configuration with maximum_bytes_billed set to 10**10 (10 GB), used as a
# cost guard for the query below.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
# Submit the query using that configuration.
max_commits_query_job = client.query(max_commits_query, job_config=safe_config)
# Materialize the query results as a pandas DataFrame.
max_commits_result = max_commits_query_job.to_dataframe()
# Show only the first rows (the top committers, given the ORDER BY above).
print(max_commits_result.head())

          committer_name  num_commits
0     Greg Kroah-Hartman         3545
1        David S. Miller         3120
2  TensorFlower Gardener         2449
3         Linus Torvalds         2424
4        Benjamin Pasero         1127


In [7]:
# Construct a reference to the "languages" table.
# NOTE: this rebinds `table_ref`, which an earlier cell pointed at
# "sample_commits"; from here on it refers to the "languages" table.
table_ref = dataset_ref.table("languages")

# API request - fetch the table
languages_table = client.get_table(table_ref)

# Preview the first five rows of the table. This is the cell's last
# expression, so the notebook renders the resulting DataFrame.
client.list_rows(languages_table, max_results=5).to_dataframe()

Unnamed: 0,repo_name,language
0,lemi136/puntovent,"[{'name': 'C', 'bytes': 80}]"
1,taxigps/nctool,"[{'name': 'C', 'bytes': 4461}]"
2,ahy1/strbuf,"[{'name': 'C', 'bytes': 5573}]"
3,nleiten/mod_rpaf-ng,"[{'name': 'C', 'bytes': 30330}]"
4,kmcallister/alameda,"[{'name': 'C', 'bytes': 17077}]"


In [14]:
# Rank languages by how many repositories list them. UNNEST(language) expands
# the repeated `language` field so each (repo, language entry) pair becomes one
# row; COUNT(*) grouped by language name then gives a per-language repo count
# (assuming a repo lists each language at most once - TODO confirm).
pop_lang_query = """
    SELECT language_list.name AS language_name, COUNT(*) AS num_repos
    FROM `bigquery-public-data.github_repos.languages`,
        UNNEST(language) AS language_list
    GROUP BY language_name
    ORDER BY num_repos DESC
    """
# Job configuration with maximum_bytes_billed set to 10**10 (10 GB), used as a
# cost guard for the query below.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
# Submit the query using that configuration.
pop_lang_query_job = client.query(pop_lang_query, job_config=safe_config)
# Materialize the query results as a pandas DataFrame.
pop_lang_result = pop_lang_query_job.to_dataframe()
# Show only the first rows (the most widespread languages, given the ORDER BY).
print(pop_lang_result.head())

  language_name  num_repos
0    JavaScript    1099966
1           CSS     807826
2          HTML     777433
3         Shell     640886
4        Python     550905


In [17]:
# List every language entry (name and byte count) recorded for the single
# repository 'polyrabbit/polyglot', largest byte count first. UNNEST(language)
# expands the repeated `language` field into one row per language entry.
all_lang_query = """
    SELECT language_list.name AS name, language_list.bytes AS bytes
    FROM `bigquery-public-data.github_repos.languages`,
        UNNEST(language) AS language_list
    WHERE repo_name = 'polyrabbit/polyglot'
    ORDER BY bytes DESC
    """
# Job configuration with maximum_bytes_billed set to 10**10 (10 GB), used as a
# cost guard for the query below.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
# Submit the query using that configuration.
all_lang_query_job = client.query(all_lang_query, job_config=safe_config)
# Materialize the query results as a pandas DataFrame.
all_lang_result = all_lang_query_job.to_dataframe()
# Print the whole result frame (not just head()).
print(all_lang_result)

                    name   bytes
0                  Lasso  834726
1                      C  819142
2                Mercury  709952
3            Objective-C  495392
4    Game Maker Language  298131
..                   ...     ...
211                   XC      82
212              Arduino      81
213               Nimrod      43
214           AutoHotkey      23
215                  Tea      20

[216 rows x 2 columns]
