## Extracting Titanic Disaster Data From Kaggle

In [1]:
!pip install python-dotenv

Collecting python-dotenv
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.6.2


You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [2]:
from dotenv import load_dotenv, find_dotenv

In [3]:
# find .env automatically by walking up directories until it's found
dotenv_path = find_dotenv()
# load up the entries as environment variables
load_dotenv(dotenv_path)

True

In [2]:
# extracting environment variable using os.environ.get
import os
KAGGLE_USERNAME = os.environ.get("KAGGLE_USERNAME")
print(KAGGLE_USERNAME)

None


In [3]:
# imports
import requests
from requests import session
import os
from dotenv import load_dotenv, find_dotenv

In [4]:
# payload for post 
payload = {
    'action': 'login',
    'username': os.environ.get("KAGGLE_USERNAME"),
    'password': os.environ.get("KAGGLE_PASSWORD")
}

# url for train file (get the link from Kaggle website)
url = 'https://www.kaggle.com/c/titanic/download/train.csv'


# setup session
with session() as c:
    # post request
    c.post('https://www.kaggle.com/account/login', data=payload)
    # get request
    response = c.get(url)
    # print response text
    print(response.text)

<!DOCTYPE html>
<html lang="en">
<head>
    <title>Kaggle: Your Home for Data Science</title>
    <meta charset="utf-8" />
    <meta name="robots" content="index, follow" />
    <meta name="turbolinks-cache-control" content="no-cache" />
            <meta name="theme-color" content="#008ABC" />
    <script type="text/javascript">
        window["initialPageLoadStartTime"] = new Date().getTime();
    </script>
    <link rel="dns-prefetch" href="https://www.google-analytics.com" /><link rel="dns-prefetch" href="https://stats.g.doubleclick.net" /><link rel="dns-prefetch" href="https://js.intercomcdn.com" /><link rel="dns-prefetch" href="https://storage.googleapis.com/" />
    <link href="/static/images/favicon.ico" rel="shortcut icon" type="image/x-icon" />
    <link rel="manifest" href="/static/json/manifest.json">
    <link href="//fonts.googleapis.com/css?family=Open+Sans:400,300,300italic,400italic,600,600italic,700,700italic" rel='stylesheet' type='text/css'>
    <link

In [7]:
from requests import session
# payload
payload = {
    'action': 'login',
    'username': os.environ.get("KAGGLE_USERNAME"),
    'password': os.environ.get("KAGGLE_PASSWORD")
}


def extract_data(url, file_path):
    '''
    extract data from kaggle
    '''
    # setup session
    with session() as c:
        c.post('https://www.kaggle.com/account/login', data=payload)
        # oppen file to write
        with open(file_path, 'w') as handle:
            response = c.get(url, stream=True)
            for block in response.iter_content(1024):
                handle.write(block)


In [8]:
# urls
train_url = 'https://www.kaggle.com/c/titanic/download/train.csv'
test_url = 'https://www.kaggle.com/c/titanic/download/test.csv'

# file paths
raw_data_path = os.path.join(os.path.pardir,'data','raw')
train_data_path = os.path.join(raw_data_path, 'train.csv')
test_data_path = os.path.join(raw_data_path, 'test.csv')

# extract data
extract_data(train_url,train_data_path)
extract_data(test_url,test_data_path)

In [9]:
!ls -l ../data/raw

total 96
-rw-r--r-- 1 Dev 197613 29048 Jan 25 19:35 test.csv
-rw-r--r-- 1 Dev 197613 62086 Jan 25 19:34 train.csv


### Builiding the file script

In [10]:
get_raw_data_script_file = os.path.join(os.path.pardir,'src','data','get_raw_data.py')

In [11]:
%%writefile $get_raw_data_script_file
# -*- coding: utf-8 -*-
import os
from dotenv import find_dotenv, load_dotenv
from requests import session
import logging


# payload for login to kaggle
payload = {
    'action': 'login',
    'username': os.environ.get("KAGGLE_USERNAME"),
    'password': os.environ.get("KAGGLE_PASSWORD")
}


def extract_data(url, file_path):
    '''
    method to extract data
    '''
    with session() as c:
        c.post('https://www.kaggle.com/account/login', data=payload)
        with open(file_path, 'w') as handle:
            response = c.get(url, stream=True)
            for block in response.iter_content(1024):
                handle.write(block)


                
def main(project_dir):
    '''
    main method
    '''
    # get logger
    logger = logging.getLogger(__name__)
    logger.info('getting raw data')
    
    # urls
    train_url = 'https://www.kaggle.com/c/titanic/download/train.csv'
    test_url = 'https://www.kaggle.com/c/titanic/download/test.csv'

    # file paths
    raw_data_path = os.path.join(project_dir,'data','raw')
    train_data_path = os.path.join(raw_data_path, 'train.csv')
    test_data_path = os.path.join(raw_data_path, 'test.csv')

    # extract data
    extract_data(train_url,train_data_path)
    extract_data(test_url,test_data_path)
    logger.info('downloaded raw training and test data')


if __name__ == '__main__':
    # getting root directory
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
    
    # setup logger
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # find .env automatically by walking up directories until it's found
    dotenv_path = find_dotenv()
    # load up the entries as environment variables
    load_dotenv(dotenv_path)

    # call the main
    main(project_dir)


Overwriting ..\src\data\get_raw_data.py


In [12]:
!python $get_raw_data_script_file

2017-01-25 19:44:58,434 - __main__ - INFO - getting raw data
2017-01-25 19:45:11,618 - __main__ - INFO - downloaded raw training and test data
