## Storage API sample

In [1]:
import gcp
import gcp.storage as storage
from gcp.context import Context
import random
import pandas as pd
from StringIO import StringIO
# Project id taken from the default gcp Context (not referenced again in the visible cells).
project = Context.default().project_id

In [2]:
# Cloud Storage bucket used throughout this notebook, and its gs:// URI.
bucket_name = "yukoga-kaggle"
bucket_path = "gs://{0}".format(bucket_name)

# Sample size per data set (passed to get_skiprows when loading each CSV).
train_sample_size = 1000
test_sample_size = 1000
sample_submission_sample_size = 1000

In [6]:
%%bash 
# Ask the instance metadata endpoint for the e-mail of the default service account.
# NOTE(review): the printed address identifies the project's service account; consider clearing the output before sharing.
curl --silent -H "Metadata-Flavor: Google" http://metadata/computeMetadata/v1/instance/service-accounts/default/email

921806060722-compute@developer.gserviceaccount.com

In [3]:
# get skiprows for pandas.DataFrame
def get_skiprows(sample_size, num_records):
  """Picks the line indices pandas.read_csv should skip to keep a random sample.

  Args:
    sample_size: number of data lines to keep (must not be negative).
    num_records: total number of lines in the CSV text, including the header
      line at index 0.
  Returns:
    A sorted list of distinct line indices in [1, num_records - 1] to pass as
    `skiprows`. Index 0 is never included, so the header is always kept. The
    list is empty when there are no more than sample_size data lines.
  Raises:
    ValueError if sample_size is negative.
  """
  if sample_size < 0:
    raise ValueError("sample_size must not be negative")
  # Lines 1..num_records-1 are data; line 0 is the header.
  data_lines = max(num_records - 1, 0)
  # Skip exactly the surplus so that sample_size data lines remain
  # (nothing is skipped when the file is smaller than the requested sample).
  num_to_skip = max(data_lines - sample_size, 0)
  return sorted(random.sample(range(1, num_records), num_to_skip))

In [4]:
# test data
# Read the object's content into the notebook variable tmp_table (bound by the %storage magic).
%storage read --object "gs://yukoga-kaggle/facebook-checkin/test.csv" --variable tmp_table
# Number of '\n'-separated pieces: counts the header line, plus one empty
# trailing piece when the text ends with a newline.
num_records = len(tmp_table.split('\n'))
# Parse only a random subset of lines; get_skiprows chooses which line indices to drop.
test = pd.read_csv(StringIO(tmp_table), skiprows=get_skiprows(test_sample_size, num_records))
# Drop the reference to the raw text once it has been parsed.
del tmp_table

In [5]:
# Preview the first rows of the sampled test data.
test.head()

Unnamed: 0,row_id,x,y,accuracy,time
0,2846,6.5062,7.8125,163,902890
1,3545,4.061,2.3397,70,947237
2,16838,4.6995,2.8846,65,790867
3,26387,8.6525,0.5807,65,841739
4,31764,6.5354,7.0423,61,885250


In [None]:
# get sampled records from cloud storage
def read_sampled_lines(item, sample_size):
  """Reads the leading lines of an item, fetching only as many bytes as needed.

  Args:
    item: item object from Google Cloud Storage. It must expose `metadata.size`
      and `read_from(start_offset=..., byte_count=...)`.
    sample_size: max number of lines to return. If None, return all lines.
  Returns:
    The text content of the item as a list of lines (at most sample_size).
  Raises:
    Exception if there was an error requesting the item's content.
  """
  def read_specific_lines(item, offset, num_records):
    """Returns up to num_records lines, starting about `offset` records in."""
    # Initial guess of 100 bytes per record, both for the start position and
    # for the size of the first read window.
    start_to_read = 100 * (0 if offset is None else offset)
    max_to_read = item.metadata.size
    # The size in bytes is an upper bound on the number of lines.
    num_records = max_to_read if num_records is None else num_records
    bytes_to_read = min(100 * num_records, max_to_read)

    while True:
      # NOTE(review): for a non-zero offset, confirm whether read_from treats
      # byte_count as a length or as an end position before relying on it.
      content = item.read_from(start_offset=start_to_read, byte_count=bytes_to_read)
      lines = content.split('\n')
      if len(lines) > num_records or bytes_to_read >= max_to_read:
        break
      # Not enough lines yet: try 10 times more bytes, capped at the item size.
      bytes_to_read = min(bytes_to_read * 10, max_to_read)

    # The last piece is cut mid-line unless the read reached the end of the
    # item; in that case it is dropped only when empty (text ended with '\n').
    if bytes_to_read < max_to_read or not lines[-1]:
      del lines[-1]
    return lines[0:num_records]

  if sample_size is not None and sample_size <= 0:
    return []
  # TODO: only the leading lines are returned; sampling from a random offset
  # is not implemented yet.
  return read_specific_lines(item, None, sample_size)


In [13]:
mybucket = storage.Bucket(bucket_name)
# Print every item in the bucket together with the size reported in its metadata.
for item in mybucket.items():
  print item.metadata.name + " : " + str(item.metadata.size)
  
# `item` is whatever the loop left bound last; it is reused here (and by the
# following cells) only as a handle for inspecting the Item API.
help(item.read_lines)

expedia/ : 0
expedia/destinations.csv : 138159416
expedia/sample_submission.csv : 31756066
expedia/test.csv : 276554476
expedia/train.csv : 4070445781
facebook-checkin/sample_submission.csv : 351785336
facebook-checkin/test.csv : 273911533
facebook-checkin/train.csv : 1268930440
Help on method read_lines in module gcp.storage._item:

read_lines(self, max_lines=None) method of gcp.storage._item.Item instance
    Reads the content of this item as text, and return a list of lines up to some max.
    
    Args:
      max_lines: max number of lines to return. If None, return all lines.
    Returns:
      The text content of the item as a list of lines.
    Raises:
      Exception if there was an error requesting the item's content.



In [9]:
import inspect
# Print the source of the storage API's object_download to see how a ranged
# read (start_offset / byte_count) is turned into a request.
print inspect.getsource(item._api.object_download)

  def object_download(self, bucket, key, start_offset=0, byte_count=None):
    """Reads the contents of an object as text.

    Args:
      bucket: the name of the bucket containing the object.
      key: the key of the object to be read.
      start_offset: the start offset of bytes to read.
      byte_count: the number of bytes to read. If None, it reads to the end.
    Returns:
      The text content within the object.
    Raises:
      Exception if the object could not be read from.
    """
    args = {'alt': 'media'}
    headers = {}
    if start_offset > 0 or byte_count is not None:
      header = 'bytes=%d-' % start_offset
      if byte_count is not None:
        header += '%d' % byte_count
      headers['Range'] = header
    url = Api._DOWNLOAD_ENDPOINT + (Api._OBJECT_PATH % (bucket, Api._escape_key(key)))
    return gcp._util.Http.request(url, args=args, headers=headers,
        credentials=self._credentials, raw_response=True)



In [16]:
# Print the source of Item.read_from to see how start_offset and byte_count are passed on.
print inspect.getsource(item.read_from)

  def read_from(self, start_offset=0, byte_count=None):
    """Reads the content of this item as text.

    Args:
      start_offset: the start offset of bytes to read.
      byte_count: the number of bytes to read. If None, it reads to the end.
    Returns:
      The text content within the item.
    Raises:
      Exception if there was an error requesting the item's content.
    """
    try:
      return self._api.object_download(self._bucket, self._key,
                                       start_offset=start_offset, byte_count=byte_count)
    except Exception as e:
      raise e



In [15]:
# Print the source of Item.read_lines to see how it decides how many bytes to read for max_lines.
print inspect.getsource(item.read_lines)

  def read_lines(self, max_lines=None):
    """Reads the content of this item as text, and return a list of lines up to some max.

    Args:
      max_lines: max number of lines to return. If None, return all lines.
    Returns:
      The text content of the item as a list of lines.
    Raises:
      Exception if there was an error requesting the item's content.
    """
    if max_lines is None:
      return self.read_from().split('\n')

    max_to_read = self.metadata.size
    bytes_to_read = min(100 * max_lines, self.metadata.size)
    lines = []
    while True:
      content = self.read_from(byte_count=bytes_to_read)

      lines = content.split('\n')
      if len(lines) > max_lines or bytes_to_read >= max_to_read:
        break
      # try 10 times more bytes or max
      bytes_to_read = min(bytes_to_read * 10, max_to_read)

    # remove the partial line at last
    del lines[-1]
    return lines[0:max_lines]



In [12]:
# Print the source of the HTTP helper used by the gcp package to issue requests.
print inspect.getsource(gcp._util.Http.request)

  @staticmethod
  def request(url, args=None, data=None, headers=None, method=None,
              credentials=None, raw_response=False, stats=None):
    """Issues HTTP requests.

    Args:
      url: the URL to request.
      args: optional query string arguments.
      data: optional data to be sent within the request.
      headers: optional headers to include in the request.
      method: optional HTTP method to use. If unspecified this is inferred
          (GET or POST) based on the existence of request data.
      credentials: optional set of credentials to authorize the request.
      raw_response: whether the raw response content should be returned as-is.
      stats: an optional dictionary that, if provided, will be populated with some
          useful info about the request, like 'duration' in seconds and 'data_size' in
          bytes. These may be useful optimizing the access to rate-limited APIs.
    Returns:
      The parsed response object.
    Raises:
      Exception wh

In [None]:
# Each section below follows the same steps: read one CSV object into tmp_table
# via the %storage magic, count its '\n'-separated lines (header included),
# parse a random subset of lines with pandas, then drop the raw text.

# test data
%storage read --object "gs://yukoga-kaggle/facebook-checkin/test.csv" --variable tmp_table
num_records = len(tmp_table.split('\n'))
test = pd.read_csv(StringIO(tmp_table), skiprows=get_skiprows(test_sample_size, num_records))
del tmp_table

# sample submission data
%storage read --object "gs://yukoga-kaggle/facebook-checkin/sample_submission.csv" --variable tmp_table
num_records = len(tmp_table.split('\n'))
sample_submission = pd.read_csv(StringIO(tmp_table), skiprows=get_skiprows(sample_submission_sample_size, num_records))
del tmp_table

# train data
# NOTE(review): presumably the whole object is held in memory as text before
# sampling; train.csv is the largest of the three files - confirm it fits.
%storage read --object "gs://yukoga-kaggle/facebook-checkin/train.csv" --variable tmp_table
num_records = len(tmp_table.split('\n'))
train = pd.read_csv(StringIO(tmp_table), skiprows=get_skiprows(train_sample_size, num_records))
del tmp_table

## Appendix: tips for related Python code

### Random sampling from a list
```python
mylist = [1,2,3,4,5,6,7,8,9,10]
for idx, val in enumerate(mylist):
  print "{0} : {1}".format(idx, val)
print "================"
for idx, val in enumerate(random.sample(mylist, 3)):
  print "{0} : {1}".format(idx, val)
```

### [load csv from google cloud storage to pandas.DataFrame on Google Datalab](http://stackoverflow.com/questions/37990467/how-can-i-load-my-csv-from-google-datalab-to-a-pandas-data-frame)

```python
import pandas as pd
from StringIO import StringIO

# Read csv file from GCS into a variable
%storage read --object gs://cloud-datalab-samples/cars.csv --variable cars

# Store in a pandas dataframe
df = pd.read_csv(StringIO(cars))
```

### [Read a small random sample from a big CSV file into a Python data frame](http://stackoverflow.com/questions/22258491/read-a-small-random-sample-from-a-big-csv-file-into-a-python-data-frame)
Assuming no header in the CSV file:

```python
import pandas
import random

n = 1000000 #number of records in file
s = 10000 #desired sample size
filename = "data.txt"
skip = sorted(random.sample(xrange(n),n-s))
df = pandas.read_csv(filename, skiprows=skip)
```  

It would be better if `read_csv` had a `keeprows` option, or if `skiprows` accepted a callback function instead of a list.

With header and unknown file length:

```python  
import pandas
import random

filename = "data.txt"
n = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
s = 10000 #desired sample size
skip = sorted(random.sample(xrange(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list
df = pandas.read_csv(filename, skiprows=skip)
```

#### Sample code
```python 
# locals() is used as a name -> object mapping so that variables can be looked
# up and assigned by a computed name.
ns = locals()
for item in mybucket.items(prefix='facebook-checkin/', delimiter='/'):
  # Assumes item.uri looks like gs://<bucket>/facebook-checkin/<file>, so the
  # fifth '/'-separated piece is the file name - TODO confirm.
  file_name = item.uri.split('/')[4]
  # File name without its extension, e.g. "test" for "test.csv".
  data_name = file_name.split('.')[0]
  %storage read --object $item.uri --variable tmp_table
  num_records = len(tmp_table.split('\n'))
  # Looks up the matching <data_name>_sample_size variable.
  sample_size = ns[data_name + "_sample_size"]
  # Stores the sampled frame under the data set's own name (e.g. `test`).
  ns[data_name] = pd.read_csv(StringIO(tmp_table), 
                              skiprows=get_skiprows(sample_size, num_records))
  del tmp_table
  print file_name + " : " + str(num_records) + " records."
```