diff --git a/chierici_practical_part1.ipynb b/chierici_practical_part1.ipynb index 1a9668c..e6e61fd 100644 --- a/chierici_practical_part1.ipynb +++ b/chierici_practical_part1.ipynb @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "colab": {}, "colab_type": "code", @@ -73,9 +73,300 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " var force = true;\n", + "\n", + " if (typeof (root._bokeh_onload_callbacks) === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + " var JS_MIME_TYPE = 'application/javascript';\n", + " var HTML_MIME_TYPE = 'text/html';\n", + " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " var CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " var script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " var cell = handle.cell;\n", + "\n", + " var id = cell.output_area._bokeh_element_id;\n", + " var server_id = cell.output_area._bokeh_server_id;\n", + " // Clean up Bokeh references\n", + " if (id != null && id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " var id = msg.content.text.trim();\n", + " if (id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", + " cell.notebook.kernel.execute(cmd);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " var output_area = handle.output_area;\n", + " var output = handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " var bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " var script_attrs = bk_div.children[0].attributes;\n", + " for (var i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " var toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " var events = require('base/js/events');\n", + " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + "\n", + " \n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " var NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded() {\n", + " var el = document.getElementById(\"1002\");\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS is loading...\";\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(display_loaded, 100)\n", + " }\n", + " }\n", + "\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", + " }\n", + " finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.info(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(js_urls, callback) {\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = js_urls.length;\n", + " for (var i = 0; i < js_urls.length; i++) {\n", + " var url = js_urls[i];\n", + " var s = document.createElement('script');\n", + " s.src = url;\n", + " s.async = false;\n", + " s.onreadystatechange = s.onload = function() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", + " run_callbacks()\n", + " }\n", + " };\n", + " s.onerror = function() {\n", + " console.warn(\"failed to load library \" + url);\n", + " };\n", + " console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", + " }\n", + " };var element = document.getElementById(\"1002\");\n", + " if (element == null) {\n", + " console.log(\"Bokeh: ERROR: autoload.js configured with elementid '1002' but no matching script tag was found. \")\n", + " return false;\n", + " }\n", + "\n", + " var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.0.4.min.js\"];\n", + "\n", + " var inline_js = [\n", + " function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + " \n", + " function(Bokeh) {\n", + " \n", + " },\n", + " function(Bokeh) {\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.css\");\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.css\");\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.css\");\n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " \n", + " if ((root.Bokeh !== undefined) || (force === true)) {\n", + " for (var i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }if (force === true) {\n", + " display_loaded();\n", + " }} else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " root._bokeh_failed_load = true;\n", + " } else if (force !== true) {\n", + " var cell = $(document.getElementById(\"1002\")).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + "\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(js_urls, function() {\n", + " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], + "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof (root._bokeh_onload_callbacks) === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"1002\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n }\n finally {\n delete root._bokeh_onload_callbacks\n }\n console.info(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(js_urls, callback) {\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = js_urls.length;\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var s = document.createElement('script');\n s.src = url;\n s.async = false;\n s.onreadystatechange = s.onload = function() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.log(\"Bokeh: all BokehJS libraries loaded\");\n run_callbacks()\n }\n };\n s.onerror = function() {\n console.warn(\"failed to load library \" + url);\n };\n console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.getElementsByTagName(\"head\")[0].appendChild(s);\n }\n };var element = document.getElementById(\"1002\");\n if (element == null) {\n console.log(\"Bokeh: ERROR: autoload.js configured with elementid '1002' but no matching script tag was found. \")\n return false;\n }\n\n var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.0.4.min.js\"];\n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n \n function(Bokeh) {\n \n },\n function(Bokeh) {\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.css\");\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.css\");\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.css\");\n }\n ];\n\n function run_inline_js() {\n \n if ((root.Bokeh !== undefined) || (force === true)) {\n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"1002\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(js_urls, function() {\n console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "output_notebook()" ] @@ -92,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "colab": {}, "colab_type": "code", @@ -106,7 +397,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "colab": {}, "colab_type": "code", @@ -132,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "colab": {}, "colab_type": "code", @@ -161,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -202,28 +493,307 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 4712 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 2047, - "status": "ok", - "timestamp": 1554369595681, - "user": { - "displayName": "Marco Chierici", - "photoUrl": "https://lh6.googleusercontent.com/-1LjDBMGAnW8/AAAAAAAAAAI/AAAAAAAACB0/ScmrJqjZC-4/s64/photo.jpg", - "userId": "06871654247545486268" - }, - "user_tz": -120 - }, - "id": "CoWDeDBl2wHQ", - "outputId": "5a41d67f-63d7-4365-c170-ee04ef5727c1" - }, - "outputs": [], + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on function read_csv in module pandas.io.parsers:\n", + "\n", + "read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='\"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None)\n", + " Read a comma-separated values (csv) file into DataFrame.\n", + " \n", + " Also supports optionally iterating or breaking of the file\n", + " into chunks.\n", + " \n", + " Additional help can be found in the online docs for\n", + " `IO Tools `_.\n", + " \n", + " Parameters\n", + " ----------\n", + " filepath_or_buffer : str, path object, or file-like object\n", + " Any valid string path is acceptable. The string could be a URL. Valid\n", + " URL schemes include http, ftp, s3, and file. For file URLs, a host is\n", + " expected. A local file could be: file://localhost/path/to/table.csv.\n", + " \n", + " If you want to pass in a path object, pandas accepts either\n", + " ``pathlib.Path`` or ``py._path.local.LocalPath``.\n", + " \n", + " By file-like object, we refer to objects with a ``read()`` method, such as\n", + " a file handler (e.g. via builtin ``open`` function) or ``StringIO``.\n", + " sep : str, default ','\n", + " Delimiter to use. If sep is None, the C engine cannot automatically detect\n", + " the separator, but the Python parsing engine can, meaning the latter will\n", + " be used and automatically detect the separator by Python's builtin sniffer\n", + " tool, ``csv.Sniffer``. In addition, separators longer than 1 character and\n", + " different from ``'\\s+'`` will be interpreted as regular expressions and\n", + " will also force the use of the Python parsing engine. Note that regex\n", + " delimiters are prone to ignoring quoted data. Regex example: ``'\\r\\t'``.\n", + " delimiter : str, default ``None``\n", + " Alias for sep.\n", + " header : int, list of int, default 'infer'\n", + " Row number(s) to use as the column names, and the start of the\n", + " data. Default behavior is to infer the column names: if no names\n", + " are passed the behavior is identical to ``header=0`` and column\n", + " names are inferred from the first line of the file, if column\n", + " names are passed explicitly then the behavior is identical to\n", + " ``header=None``. Explicitly pass ``header=0`` to be able to\n", + " replace existing names. The header can be a list of integers that\n", + " specify row locations for a multi-index on the columns\n", + " e.g. [0,1,3]. Intervening rows that are not specified will be\n", + " skipped (e.g. 2 in this example is skipped). Note that this\n", + " parameter ignores commented lines and empty lines if\n", + " ``skip_blank_lines=True``, so ``header=0`` denotes the first line of\n", + " data rather than the first line of the file.\n", + " names : array-like, optional\n", + " List of column names to use. If file contains no header row, then you\n", + " should explicitly pass ``header=None``. Duplicates in this list will cause\n", + " a ``UserWarning`` to be issued.\n", + " index_col : int, sequence or bool, optional\n", + " Column to use as the row labels of the DataFrame. If a sequence is given, a\n", + " MultiIndex is used. If you have a malformed file with delimiters at the end\n", + " of each line, you might consider ``index_col=False`` to force pandas to\n", + " not use the first column as the index (row names).\n", + " usecols : list-like or callable, optional\n", + " Return a subset of the columns. If list-like, all elements must either\n", + " be positional (i.e. integer indices into the document columns) or strings\n", + " that correspond to column names provided either by the user in `names` or\n", + " inferred from the document header row(s). For example, a valid list-like\n", + " `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.\n", + " Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.\n", + " To instantiate a DataFrame from ``data`` with element order preserved use\n", + " ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns\n", + " in ``['foo', 'bar']`` order or\n", + " ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``\n", + " for ``['bar', 'foo']`` order.\n", + " \n", + " If callable, the callable function will be evaluated against the column\n", + " names, returning names where the callable function evaluates to True. An\n", + " example of a valid callable argument would be ``lambda x: x.upper() in\n", + " ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster\n", + " parsing time and lower memory usage.\n", + " squeeze : bool, default False\n", + " If the parsed data only contains one column then return a Series.\n", + " prefix : str, optional\n", + " Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...\n", + " mangle_dupe_cols : bool, default True\n", + " Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than\n", + " 'X'...'X'. Passing in False will cause data to be overwritten if there\n", + " are duplicate names in the columns.\n", + " dtype : Type name or dict of column -> type, optional\n", + " Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32,\n", + " 'c': 'Int64'}\n", + " Use `str` or `object` together with suitable `na_values` settings\n", + " to preserve and not interpret dtype.\n", + " If converters are specified, they will be applied INSTEAD\n", + " of dtype conversion.\n", + " engine : {'c', 'python'}, optional\n", + " Parser engine to use. The C engine is faster while the python engine is\n", + " currently more feature-complete.\n", + " converters : dict, optional\n", + " Dict of functions for converting values in certain columns. Keys can either\n", + " be integers or column labels.\n", + " true_values : list, optional\n", + " Values to consider as True.\n", + " false_values : list, optional\n", + " Values to consider as False.\n", + " skipinitialspace : bool, default False\n", + " Skip spaces after delimiter.\n", + " skiprows : list-like, int or callable, optional\n", + " Line numbers to skip (0-indexed) or number of lines to skip (int)\n", + " at the start of the file.\n", + " \n", + " If callable, the callable function will be evaluated against the row\n", + " indices, returning True if the row should be skipped and False otherwise.\n", + " An example of a valid callable argument would be ``lambda x: x in [0, 2]``.\n", + " skipfooter : int, default 0\n", + " Number of lines at bottom of file to skip (Unsupported with engine='c').\n", + " nrows : int, optional\n", + " Number of rows of file to read. Useful for reading pieces of large files.\n", + " na_values : scalar, str, list-like, or dict, optional\n", + " Additional strings to recognize as NA/NaN. If dict passed, specific\n", + " per-column NA values. By default the following values are interpreted as\n", + " NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',\n", + " '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan',\n", + " 'null'.\n", + " keep_default_na : bool, default True\n", + " Whether or not to include the default NaN values when parsing the data.\n", + " Depending on whether `na_values` is passed in, the behavior is as follows:\n", + " \n", + " * If `keep_default_na` is True, and `na_values` are specified, `na_values`\n", + " is appended to the default NaN values used for parsing.\n", + " * If `keep_default_na` is True, and `na_values` are not specified, only\n", + " the default NaN values are used for parsing.\n", + " * If `keep_default_na` is False, and `na_values` are specified, only\n", + " the NaN values specified `na_values` are used for parsing.\n", + " * If `keep_default_na` is False, and `na_values` are not specified, no\n", + " strings will be parsed as NaN.\n", + " \n", + " Note that if `na_filter` is passed in as False, the `keep_default_na` and\n", + " `na_values` parameters will be ignored.\n", + " na_filter : bool, default True\n", + " Detect missing value markers (empty strings and the value of na_values). In\n", + " data without any NAs, passing na_filter=False can improve the performance\n", + " of reading a large file.\n", + " verbose : bool, default False\n", + " Indicate number of NA values placed in non-numeric columns.\n", + " skip_blank_lines : bool, default True\n", + " If True, skip over blank lines rather than interpreting as NaN values.\n", + " parse_dates : bool or list of int or names or list of lists or dict, default False\n", + " The behavior is as follows:\n", + " \n", + " * boolean. If True -> try parsing the index.\n", + " * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3\n", + " each as a separate date column.\n", + " * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as\n", + " a single date column.\n", + " * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call\n", + " result 'foo'\n", + " \n", + " If a column or index cannot be represented as an array of datetimes,\n", + " say because of an unparseable value or a mixture of timezones, the column\n", + " or index will be returned unaltered as an object data type. For\n", + " non-standard datetime parsing, use ``pd.to_datetime`` after\n", + " ``pd.read_csv``. To parse an index or column with a mixture of timezones,\n", + " specify ``date_parser`` to be a partially-applied\n", + " :func:`pandas.to_datetime` with ``utc=True``. See\n", + " :ref:`io.csv.mixed_timezones` for more.\n", + " \n", + " Note: A fast-path exists for iso8601-formatted dates.\n", + " infer_datetime_format : bool, default False\n", + " If True and `parse_dates` is enabled, pandas will attempt to infer the\n", + " format of the datetime strings in the columns, and if it can be inferred,\n", + " switch to a faster method of parsing them. In some cases this can increase\n", + " the parsing speed by 5-10x.\n", + " keep_date_col : bool, default False\n", + " If True and `parse_dates` specifies combining multiple columns then\n", + " keep the original columns.\n", + " date_parser : function, optional\n", + " Function to use for converting a sequence of string columns to an array of\n", + " datetime instances. The default uses ``dateutil.parser.parser`` to do the\n", + " conversion. Pandas will try to call `date_parser` in three different ways,\n", + " advancing to the next if an exception occurs: 1) Pass one or more arrays\n", + " (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the\n", + " string values from the columns defined by `parse_dates` into a single array\n", + " and pass that; and 3) call `date_parser` once for each row using one or\n", + " more strings (corresponding to the columns defined by `parse_dates`) as\n", + " arguments.\n", + " dayfirst : bool, default False\n", + " DD/MM format dates, international and European format.\n", + " iterator : bool, default False\n", + " Return TextFileReader object for iteration or getting chunks with\n", + " ``get_chunk()``.\n", + " chunksize : int, optional\n", + " Return TextFileReader object for iteration.\n", + " See the `IO Tools docs\n", + " `_\n", + " for more information on ``iterator`` and ``chunksize``.\n", + " compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'\n", + " For on-the-fly decompression of on-disk data. If 'infer' and\n", + " `filepath_or_buffer` is path-like, then detect compression from the\n", + " following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no\n", + " decompression). If using 'zip', the ZIP file must contain only one data\n", + " file to be read in. Set to None for no decompression.\n", + " \n", + " .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.\n", + " \n", + " thousands : str, optional\n", + " Thousands separator.\n", + " decimal : str, default '.'\n", + " Character to recognize as decimal point (e.g. use ',' for European data).\n", + " lineterminator : str (length 1), optional\n", + " Character to break file into lines. Only valid with C parser.\n", + " quotechar : str (length 1), optional\n", + " The character used to denote the start and end of a quoted item. Quoted\n", + " items can include the delimiter and it will be ignored.\n", + " quoting : int or csv.QUOTE_* instance, default 0\n", + " Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of\n", + " QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).\n", + " doublequote : bool, default ``True``\n", + " When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate\n", + " whether or not to interpret two consecutive quotechar elements INSIDE a\n", + " field as a single ``quotechar`` element.\n", + " escapechar : str (length 1), optional\n", + " One-character string used to escape other characters.\n", + " comment : str, optional\n", + " Indicates remainder of line should not be parsed. If found at the beginning\n", + " of a line, the line will be ignored altogether. This parameter must be a\n", + " single character. Like empty lines (as long as ``skip_blank_lines=True``),\n", + " fully commented lines are ignored by the parameter `header` but not by\n", + " `skiprows`. For example, if ``comment='#'``, parsing\n", + " ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being\n", + " treated as the header.\n", + " encoding : str, optional\n", + " Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python\n", + " standard encodings\n", + " `_ .\n", + " dialect : str or csv.Dialect, optional\n", + " If provided, this parameter will override values (default or not) for the\n", + " following parameters: `delimiter`, `doublequote`, `escapechar`,\n", + " `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to\n", + " override values, a ParserWarning will be issued. See csv.Dialect\n", + " documentation for more details.\n", + " tupleize_cols : bool, default False\n", + " Leave a list of tuples on columns as is (default is to convert to\n", + " a MultiIndex on the columns).\n", + " \n", + " .. deprecated:: 0.21.0\n", + " This argument will be removed and will always convert to MultiIndex\n", + " \n", + " error_bad_lines : bool, default True\n", + " Lines with too many fields (e.g. a csv line with too many commas) will by\n", + " default cause an exception to be raised, and no DataFrame will be returned.\n", + " If False, then these \"bad lines\" will dropped from the DataFrame that is\n", + " returned.\n", + " warn_bad_lines : bool, default True\n", + " If error_bad_lines is False, and warn_bad_lines is True, a warning for each\n", + " \"bad line\" will be output.\n", + " delim_whitespace : bool, default False\n", + " Specifies whether or not whitespace (e.g. ``' '`` or ``' '``) will be\n", + " used as the sep. Equivalent to setting ``sep='\\s+'``. If this option\n", + " is set to True, nothing should be passed in for the ``delimiter``\n", + " parameter.\n", + " \n", + " .. versionadded:: 0.18.1 support for the Python parser.\n", + " \n", + " low_memory : bool, default True\n", + " Internally process the file in chunks, resulting in lower memory use\n", + " while parsing, but possibly mixed type inference. To ensure no mixed\n", + " types either set False, or specify the type with the `dtype` parameter.\n", + " Note that the entire file is read into a single DataFrame regardless,\n", + " use the `chunksize` or `iterator` parameter to return the data in chunks.\n", + " (Only valid with C parser).\n", + " memory_map : bool, default False\n", + " If a filepath is provided for `filepath_or_buffer`, map the file object\n", + " directly onto memory and access the data directly from there. Using this\n", + " option can improve performance because there is no longer any I/O overhead.\n", + " float_precision : str, optional\n", + " Specifies which converter the C engine should use for floating-point\n", + " values. The options are `None` for the ordinary converter,\n", + " `high` for the high-precision converter, and `round_trip` for the\n", + " round-trip converter.\n", + " \n", + " Returns\n", + " -------\n", + " DataFrame or TextParser\n", + " A comma-separated values (csv) file is returned as two-dimensional\n", + " data structure with labeled axes.\n", + " \n", + " See Also\n", + " --------\n", + " to_csv : Write DataFrame to a comma-separated values (csv) file.\n", + " read_csv : Read a comma-separated values (csv) file into DataFrame.\n", + " read_fwf : Read a table of fixed-width formatted lines into DataFrame.\n", + " \n", + " Examples\n", + " --------\n", + " >>> pd.read_csv('data.csv') # doctest: +SKIP\n", + "\n" + ] + } + ], "source": [ "help(pd.read_csv)" ] @@ -240,14 +810,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "colab": {}, "colab_type": "code", "id": "2iltS1Q-k3Wn", "outputId": "ea81462d-8a49-406b-c933-182c49379053" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(136, 52230)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data_tr.shape" ] @@ -266,14 +847,242 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "colab": {}, "colab_type": "code", "id": "TVfPwU6-k3Wt", "outputId": "884dd460-6c53-4bf4-9c37-f7c7299b37a2" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sampleIDALB.Gene_AceViewCD24L4.1.Gene_AceViewRPS11.Gene_RefSeqRPS18.Gene_AceViewC5orf13.Gene_AceViewCCT2.Gene_AceViewCOL1A1.Gene_AceViewDDX1.Gene_AceViewEEF1A1.Gene_AceView...zawskaw.Gene_AceViewzeedor.Gene_AceViewzergor.Gene_AceViewzorsa.Gene_AceViewzoychabu.Gene_AceViewzoysteeby.Gene_AceViewzudee.Gene_AceViewzureyby.Gene_AceViewzuswoybu.Gene_AceViewzyjee.Gene_AceView
0SEQC_NB0019.2918.8221.1720.9020.0216.3118.6015.7321.71...0.00.000.00.000.00.00.000.00.00.0
1SEQC_NB0039.2520.2522.4422.0021.0517.0619.3922.8422.72...0.05.540.03.390.00.05.450.00.00.0
2SEQC_NB0058.9920.0922.0921.7121.6516.8523.0215.7922.24...0.00.000.03.750.00.00.000.00.00.0
3SEQC_NB0117.3219.8220.5220.9021.5816.4918.9115.4522.06...0.00.000.00.000.00.00.000.00.00.0
4SEQC_NB01310.5621.1920.6921.2920.2816.2217.1516.0121.84...0.00.000.05.200.00.00.000.00.00.0
\n", + "

5 rows × 52230 columns

\n", + "
" + ], + "text/plain": [ + " sampleID ALB.Gene_AceView CD24L4.1.Gene_AceView RPS11.Gene_RefSeq \\\n", + "0 SEQC_NB001 9.29 18.82 21.17 \n", + "1 SEQC_NB003 9.25 20.25 22.44 \n", + "2 SEQC_NB005 8.99 20.09 22.09 \n", + "3 SEQC_NB011 7.32 19.82 20.52 \n", + "4 SEQC_NB013 10.56 21.19 20.69 \n", + "\n", + " RPS18.Gene_AceView C5orf13.Gene_AceView CCT2.Gene_AceView \\\n", + "0 20.90 20.02 16.31 \n", + "1 22.00 21.05 17.06 \n", + "2 21.71 21.65 16.85 \n", + "3 20.90 21.58 16.49 \n", + "4 21.29 20.28 16.22 \n", + "\n", + " COL1A1.Gene_AceView DDX1.Gene_AceView EEF1A1.Gene_AceView ... \\\n", + "0 18.60 15.73 21.71 ... \n", + "1 19.39 22.84 22.72 ... \n", + "2 23.02 15.79 22.24 ... \n", + "3 18.91 15.45 22.06 ... \n", + "4 17.15 16.01 21.84 ... \n", + "\n", + " zawskaw.Gene_AceView zeedor.Gene_AceView zergor.Gene_AceView \\\n", + "0 0.0 0.00 0.0 \n", + "1 0.0 5.54 0.0 \n", + "2 0.0 0.00 0.0 \n", + "3 0.0 0.00 0.0 \n", + "4 0.0 0.00 0.0 \n", + "\n", + " zorsa.Gene_AceView zoychabu.Gene_AceView zoysteeby.Gene_AceView \\\n", + "0 0.00 0.0 0.0 \n", + "1 3.39 0.0 0.0 \n", + "2 3.75 0.0 0.0 \n", + "3 0.00 0.0 0.0 \n", + "4 5.20 0.0 0.0 \n", + "\n", + " zudee.Gene_AceView zureyby.Gene_AceView zuswoybu.Gene_AceView \\\n", + "0 0.00 0.0 0.0 \n", + "1 5.45 0.0 0.0 \n", + "2 0.00 0.0 0.0 \n", + "3 0.00 0.0 0.0 \n", + "4 0.00 0.0 0.0 \n", + "\n", + " zyjee.Gene_AceView \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "\n", + "[5 rows x 52230 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data_tr.head()" ] @@ -290,13 +1099,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "colab": {}, "colab_type": "code", "id": "F1bUOeE4k3Wx" }, - "outputs": [], + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"['sampleID'] not found in axis\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata_tr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_tr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'sampleID'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdata_ts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_ts\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'sampleID'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 3938\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3939\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3940\u001b[0;31m errors=errors)\n\u001b[0m\u001b[1;32m 3941\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3942\u001b[0m @rewrite_axis_style_signature('mapper', [('copy', True),\n", + "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 3778\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32min\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3779\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3780\u001b[0;31m \u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_drop_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3781\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3782\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_drop_axis\u001b[0;34m(self, labels, axis, level, errors)\u001b[0m\n\u001b[1;32m 3810\u001b[0m \u001b[0mnew_axis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3811\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3812\u001b[0;31m \u001b[0mnew_axis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3813\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0maxis_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnew_axis\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3814\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mdrop\u001b[0;34m(self, labels, errors)\u001b[0m\n\u001b[1;32m 4963\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'ignore'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4964\u001b[0m raise KeyError(\n\u001b[0;32m-> 4965\u001b[0;31m '{} not found in axis'.format(labels[mask]))\n\u001b[0m\u001b[1;32m 4966\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m~\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4967\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: \"['sampleID'] not found in axis\"" + ] + } + ], "source": [ "data_tr = data_tr.drop('sampleID', axis=1)\n", "data_ts = data_ts.drop('sampleID', axis=1)" @@ -314,14 +1139,242 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "colab": {}, "colab_type": "code", "id": "QgcQgBVek3W1", "outputId": "6cbec2e0-0001-4e03-c040-0bbddd51db5b" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ALB.Gene_AceViewCD24L4.1.Gene_AceViewRPS11.Gene_RefSeqRPS18.Gene_AceViewC5orf13.Gene_AceViewCCT2.Gene_AceViewCOL1A1.Gene_AceViewDDX1.Gene_AceViewEEF1A1.Gene_AceViewFLT3LG_.Gene_AceView...zawskaw.Gene_AceViewzeedor.Gene_AceViewzergor.Gene_AceViewzorsa.Gene_AceViewzoychabu.Gene_AceViewzoysteeby.Gene_AceViewzudee.Gene_AceViewzureyby.Gene_AceViewzuswoybu.Gene_AceViewzyjee.Gene_AceView
09.2918.8221.1720.9020.0216.3118.6015.7321.7120.02...0.00.000.00.000.00.00.000.00.00.0
19.2520.2522.4422.0021.0517.0619.3922.8422.7221.26...0.05.540.03.390.00.05.450.00.00.0
28.9920.0922.0921.7121.6516.8523.0215.7922.2420.75...0.00.000.03.750.00.00.000.00.00.0
37.3219.8220.5220.9021.5816.4918.9115.4522.0619.59...0.00.000.00.000.00.00.000.00.00.0
410.5621.1920.6921.2920.2816.2217.1516.0121.8419.74...0.00.000.05.200.00.00.000.00.00.0
\n", + "

5 rows × 52229 columns

\n", + "
" + ], + "text/plain": [ + " ALB.Gene_AceView CD24L4.1.Gene_AceView RPS11.Gene_RefSeq \\\n", + "0 9.29 18.82 21.17 \n", + "1 9.25 20.25 22.44 \n", + "2 8.99 20.09 22.09 \n", + "3 7.32 19.82 20.52 \n", + "4 10.56 21.19 20.69 \n", + "\n", + " RPS18.Gene_AceView C5orf13.Gene_AceView CCT2.Gene_AceView \\\n", + "0 20.90 20.02 16.31 \n", + "1 22.00 21.05 17.06 \n", + "2 21.71 21.65 16.85 \n", + "3 20.90 21.58 16.49 \n", + "4 21.29 20.28 16.22 \n", + "\n", + " COL1A1.Gene_AceView DDX1.Gene_AceView EEF1A1.Gene_AceView \\\n", + "0 18.60 15.73 21.71 \n", + "1 19.39 22.84 22.72 \n", + "2 23.02 15.79 22.24 \n", + "3 18.91 15.45 22.06 \n", + "4 17.15 16.01 21.84 \n", + "\n", + " FLT3LG_.Gene_AceView ... zawskaw.Gene_AceView zeedor.Gene_AceView \\\n", + "0 20.02 ... 0.0 0.00 \n", + "1 21.26 ... 0.0 5.54 \n", + "2 20.75 ... 0.0 0.00 \n", + "3 19.59 ... 0.0 0.00 \n", + "4 19.74 ... 0.0 0.00 \n", + "\n", + " zergor.Gene_AceView zorsa.Gene_AceView zoychabu.Gene_AceView \\\n", + "0 0.0 0.00 0.0 \n", + "1 0.0 3.39 0.0 \n", + "2 0.0 3.75 0.0 \n", + "3 0.0 0.00 0.0 \n", + "4 0.0 5.20 0.0 \n", + "\n", + " zoysteeby.Gene_AceView zudee.Gene_AceView zureyby.Gene_AceView \\\n", + "0 0.0 0.00 0.0 \n", + "1 0.0 5.45 0.0 \n", + "2 0.0 0.00 0.0 \n", + "3 0.0 0.00 0.0 \n", + "4 0.0 0.00 0.0 \n", + "\n", + " zuswoybu.Gene_AceView zyjee.Gene_AceView \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + "[5 rows x 52229 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data_tr.head()" ] @@ -338,14 +1391,95 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "colab": {}, "colab_type": "code", "id": "7Vbq9mqXk3W5", "outputId": "6bc4f986-30e2-4953-fc78-80b80982ba47" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sampleIDCLASSSEXRND
0SEQC_NB001011
1SEQC_NB003000
2SEQC_NB005001
3SEQC_NB011111
4SEQC_NB013011
\n", + "
" + ], + "text/plain": [ + " sampleID CLASS SEX RND\n", + "0 SEQC_NB001 0 1 1\n", + "1 SEQC_NB003 0 0 0\n", + "2 SEQC_NB005 0 0 1\n", + "3 SEQC_NB011 1 1 1\n", + "4 SEQC_NB013 0 1 1" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "labs_tr = pd.read_csv(LABS_TR, sep = \"\\t\")\n", "labs_ts = pd.read_csv(LABS_TS, sep = \"\\t\")\n", @@ -364,14 +1498,77 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "colab": {}, "colab_type": "code", "id": "pyTfzujJk3W9", "outputId": "cd7cf62a-c5b1-491a-853e-631b5cc9a4d2" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLASS
00
10
20
31
40
\n", + "
" + ], + "text/plain": [ + " CLASS\n", + "0 0\n", + "1 0\n", + "2 0\n", + "3 1\n", + "4 0" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "class_lab_tr = labs_tr[['CLASS']]\n", "class_lab_ts = labs_ts[['CLASS']]\n", @@ -391,7 +1588,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": { "colab": {}, "colab_type": "code", @@ -438,13 +1635,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": { "colab": {}, "colab_type": "code", "id": "XYs3b6JJ3qrn" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 9.29, 18.82, 21.17, ..., 0. , 0. , 0. ],\n", + " [ 9.25, 20.25, 22.44, ..., 0. , 0. , 0. ],\n", + " [ 8.99, 20.09, 22.09, ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [ 8.47, 20.75, 20.08, ..., 0. , 0. , 0. ],\n", + " [ 8.58, 20.57, 20.67, ..., 0. , 0. , 0. ],\n", + " [ 8.62, 20.13, 21.04, ..., 0. , 0. , 0. ]])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "x_tr" ] @@ -461,13 +1675,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": { "colab": {}, "colab_type": "code", "id": "iwI8uSvC4BbC" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,\n", + " 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,\n", + " 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,\n", + " 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,\n", + " 1, 1, 1, 1])" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "y_tr" ] @@ -512,7 +1743,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -566,7 +1797,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": { "colab": {}, "colab_type": "code", @@ -592,7 +1823,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": { "colab": {}, "colab_type": "code", @@ -605,7 +1836,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -615,9 +1846,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(136, 2)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "z_tr.shape" ] @@ -634,13 +1876,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": { "colab": {}, "colab_type": "code", "id": "Cma7FaOd6F1M" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.08194369 0.0433671 ]\n" + ] + } + ], "source": [ "print(pca.explained_variance_ratio_)" ] @@ -669,9 +1919,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"45fb7df9-eb17-44c9-8462-0a303c59d7ab\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1014\",\"type\":\"LinearAxis\"}],\"left\":[{\"id\":\"1019\",\"type\":\"LinearAxis\"}],\"plot_height\":400,\"plot_width\":400,\"renderers\":[{\"id\":\"1014\",\"type\":\"LinearAxis\"},{\"id\":\"1018\",\"type\":\"Grid\"},{\"id\":\"1019\",\"type\":\"LinearAxis\"},{\"id\":\"1023\",\"type\":\"Grid\"},{\"id\":\"1032\",\"type\":\"BoxAnnotation\"},{\"id\":\"1042\",\"type\":\"GlyphRenderer\"},{\"id\":\"1047\",\"type\":\"GlyphRenderer\"}],\"title\":{\"id\":\"1003\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1030\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"1006\",\"type\":\"DataRange1d\"},\"x_scale\":{\"id\":\"1010\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"1008\",\"type\":\"DataRange1d\"},\"y_scale\":{\"id\":\"1012\",\"type\":\"LinearScale\"}},\"id\":\"1004\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"overlay\":{\"id\":\"1032\",\"type\":\"BoxAnnotation\"}},\"id\":\"1026\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1029\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1020\",\"type\":\"BasicTicker\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"IurCsU7yN8CdLu7HIaI7wKqB4dmB8inALZtzfTArOcAjuG5zn/s1wIuSWeYBRS3ArB2Npd4uPcDx14+AtxMowEN3NvPDKDbARI/svWD5GsCn0SpexPI5wD9v02O5nzjAFH+LW7TxKsCwLI6AJBQ2wIB7KbWDFz/AzwBzQlmcNcCbDHrxHbs5wHXq04W4TTjAg91C5ZcsuD/SEmYIVFo3wJqNhc061zTAfHZYEjbTN8B9i9QawY04wAG2Nuq6Lj3ApD83RnLkPsBwgEoceeAiwDbIEb3cYTvA0ykBw9KSN8C8itoviAQOwNiPvqMqczXASEapUyE4QMBgt0n8o7cowCh6n1DOkTHAtjl0S6TANMAPjZy7ixg7wNFfHVf34TPANy16NwS0NcAQxoNt3l4nwCPEAbWmnPu/TnOlAd9Z/b+WFE2KrjkwwFAkzbh3kD7AKI4gmMlQMsBgCN8o/xAwwHAW7OGIEBzAsSBFvM7sPcCIY64wzqMtwLAru1I0BCNAZY89ltNpAsDgkxBsB2MuwJra3owgizfA9sMBac9/PkBfADZ4NyE9wDo/z0NpRzjA8C9D0NovOcAFoSfkfvhBwClHFvDuDzDAJH3857tjMcCIKliPocoCQJMunEX7iSnAdiOoLN/+EsBKX1IR7toHQJGsRuByIzHA1P0dIOmrIsAt7GMG8oM1wBbwVZI57DnABHYnZ7eYJcC3kzYfz7IrwOBe+MUAfTTAlHFFOxlAPMB8rCeB/i01wAyclbWUkzLAwSfVDHegREA4+1klqCguwJzOZSahYjvAL7+ZKHDMM8ByoK0LUkYVwAKdaz96LDTAvxOmzT7+HsA7ydJTmOw1wEuFdl2sOCfA/3QV2DgkMsDZrSwfl9A3wOZyJXEaRTrAI/z5uMo6JEAtDRKbi6IywCRJ70sH8QNAUMjHtC/uOsDqEpjqRDIVwPPHo27M7TPAkkr0le7sL8A=\",\"dtype\":\"float64\",\"shape\":[91]},\"y\":{\"__ndarray__\":\"XlkoU+I7IUAip2MwfhEvwEejX4Lb7ynAClHGpwYSSkC9PrXPLT4YQK7ADd8okzZAWI4Wre2AHkALIDoUNEgYQLa3tFyuZCVA/NavUkQLNMAWhp1tW/wiQCmiSpcACyXAnAlo1erxJcBoPGuI6I8ywE1e677hGzLAS5rAV21Yjj/hdP8CuN9BQD/n4H7lJzbApYSa4iGKRsBiBEZ/xwQzwDAYSxEzmyrAg7K/8WLtLMBl3WzrRHQqwMhKtmkwKzBACnH5qeNk+T+NrlLQHSY7wKmHsp3aMDtAYAgAyPNrMkChs78VvplAwMXoZcs0twJAnc8Fn16vK0DtTeg6do00wJGMVVu3PzXA5R9L2E1kN8AYajTDRmUMwJK6ufvX9B3AwP3rG1TXK0D+I3CzpRVCwDKRAJk9ah/AvzQtPsUrhj9uOyKWVwwhwM29PErcOCpAas//BuV0TUDNV16ejPMtwKYf4Ar+CTbABwWNTOqaGUCE/YB3UhgvwDB+HSu8ry/AIuhMYiHeIUCjkw+uqhYhwLsVuP0VLDHA/P/bcs8YM0B1f1twm1YWwOiCJENx6jPAtahTRuBqK8B5vSGQtNQqwGRrtrWIFSHAB+3SLs29KsDDbsucf7cswPfgza8mJCjArKXztWoAQEDQltyMgrIUQEjjEADIEzfAZbMMZav0OcDNOm95mhwswGBfH61wazLA0lBCumSOJsD7nI8CGpodQO5DYrccdytAroVbpG7gJEAhajwd9FIDQFNJCkk1GCfANEyrQX0RI8BTnRQZDCEWQC35iyEuu0BAtkY4sthtNUDWsMwjYqJAwOtxGyAYXxrADDP7WzqS979Hz1GC+7HJvxcFKZK0uSjA8WFwNIJSO0C+LOu0eD4lwJVJhohB1zJAYJYeKbM1N8DOHrikG5I2wOzpr25+rTDAzBVdoWSwBsCJ1q0ySyYpwDrm4ym+ryNAdZmXJGPaJkA=\",\"dtype\":\"float64\",\"shape\":[91]}},\"selected\":{\"id\":\"1056\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1057\",\"type\":\"UnionRenderers\"}},\"id\":\"1044\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis_label\":\"PC1\",\"formatter\":{\"id\":\"1050\",\"type\":\"BasicTickFormatter\"},\"plot\":{\"id\":\"1004\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1015\",\"type\":\"BasicTicker\"}},\"id\":\"1014\",\"type\":\"LinearAxis\"},{\"attributes\":{\"plot\":null,\"text\":\"PCA of Train data\"},\"id\":\"1003\",\"type\":\"Title\"},{\"attributes\":{\"dimension\":1,\"plot\":{\"id\":\"1004\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1020\",\"type\":\"BasicTicker\"}},\"id\":\"1023\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1010\",\"type\":\"LinearScale\"},{\"attributes\":{\"source\":{\"id\":\"1039\",\"type\":\"ColumnDataSource\"}},\"id\":\"1043\",\"type\":\"CDSView\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1046\",\"type\":\"Circle\"},{\"attributes\":{\"data_source\":{\"id\":\"1044\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1045\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1046\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"1048\",\"type\":\"CDSView\"}},\"id\":\"1047\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"callback\":null},\"id\":\"1006\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1012\",\"type\":\"LinearScale\"},{\"attributes\":{\"source\":{\"id\":\"1044\",\"type\":\"ColumnDataSource\"}},\"id\":\"1048\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1050\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1027\",\"type\":\"SaveTool\"},{\"attributes\":{},\"id\":\"1028\",\"type\":\"ResetTool\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1024\",\"type\":\"PanTool\"},{\"id\":\"1025\",\"type\":\"WheelZoomTool\"},{\"id\":\"1026\",\"type\":\"BoxZoomTool\"},{\"id\":\"1027\",\"type\":\"SaveTool\"},{\"id\":\"1028\",\"type\":\"ResetTool\"},{\"id\":\"1029\",\"type\":\"HelpTool\"}]},\"id\":\"1030\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1052\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data_source\":{\"id\":\"1039\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1040\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1041\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"1043\",\"type\":\"CDSView\"}},\"id\":\"1042\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1024\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1054\",\"type\":\"Selection\"},{\"attributes\":{\"plot\":{\"id\":\"1004\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1015\",\"type\":\"BasicTicker\"}},\"id\":\"1018\",\"type\":\"Grid\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"1X9C4JTUFsAPMQZdMXtMQEpAIGo+z0dAcpvnhkTmKMB/LnH9hfAvQISwi4ZRfeQ/xP5nDbXJRUCF3SBUeoQzQKHPFkfyPkdAlIay/JnpUUD5goOERC9SQKbbtQt0GfW/jwWzTntZJEDpbAKvtBVJQOh6DKTIIlNAl1KdzTFbQkDtQFDvngFMQOe8CKR7gUJAHeD56j+NPkBidvLaoYstwJ8kPOtnvxtAdz/h7tjzFEA4qAX1hXFOQKby/k4sDk1As4MDCMldOEDmu6DUxFA0QOHh5wvu9VFA0NVg9y4xN0CDBAp6Gkc4QKFHQsaY3EFAlrcAsmXUSUBqCHHahFE5QGKqumJBkSHAnXlL1nTsDkBSRmeCHag1QOPe5ly720JA1rP7uaLgREBWZFck068tQHcOO0ATFlRAwY25Hb7+SEA2G16SRofxvylcl8EucSNAzuxD5JNKUECtKauKvAxQQHf42lRFIlNA\",\"dtype\":\"float64\",\"shape\":[45]},\"y\":{\"__ndarray__\":\"21TAGVIYNEAxN/XrrFw2wCLo2kvHBgHA+QbwSvt/NMBLDKu2xZIJwEFBMUbrxRZAmWgzf1KfIcCgk+GrdME3wA0+1TaJphhAo7TQ8QLXz78lfn3M2uw9wHt9IYgNc0JAQ+4mGIhNL0C3KifR9vEoQJMUaUZIn0PArE8xUjVBNkB7gnvaefk1wE4TaCGepChA9t9o5q20F8CLsXIB8IEowD0ilrTKByRAt/WGBsnEQsAJmrBHoulFQOS2fxWGVzTA9VFhU3BDFkBkvugxVO1BQD/2w55MqgrA84qLxKXaTUAXHRzTiDQ8QM8pTBhtbjNAxeY1wUCaNUBDNiCG7hFGQNvt2ZamKUNAPmN4hXMpSUDCAG3k8AI1QLE2qtErjyxAKZ1OUt0xRUCn3Rl4s5lDQNzKy+j5hxVALaPZhE4LQMC39eFimkk/QKAUzSSpIChAqHpuETg0OcDggg/wbaY2wHOxrRFm/hjA\",\"dtype\":\"float64\",\"shape\":[45]}},\"selected\":{\"id\":\"1054\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1055\",\"type\":\"UnionRenderers\"}},\"id\":\"1039\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1055\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"plot\":null,\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1032\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"1025\",\"type\":\"WheelZoomTool\"},{\"attributes\":{},\"id\":\"1056\",\"type\":\"Selection\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1041\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1057\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_color\":{\"value\":\"orange\"},\"line_color\":{\"value\":\"orange\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1040\",\"type\":\"Circle\"},{\"attributes\":{\"axis_label\":\"PC2\",\"formatter\":{\"id\":\"1052\",\"type\":\"BasicTickFormatter\"},\"plot\":{\"id\":\"1004\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1020\",\"type\":\"BasicTicker\"}},\"id\":\"1019\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1015\",\"type\":\"BasicTicker\"},{\"attributes\":{\"callback\":null},\"id\":\"1008\",\"type\":\"DataRange1d\"},{\"attributes\":{\"fill_color\":{\"value\":\"blue\"},\"line_color\":{\"value\":\"blue\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1045\",\"type\":\"Circle\"}],\"root_ids\":[\"1004\"]},\"title\":\"Bokeh Application\",\"version\":\"1.0.4\"}};\n", + " var render_items = [{\"docid\":\"45fb7df9-eb17-44c9-8462-0a303c59d7ab\",\"roots\":{\"1004\":\"bd995555-6bec-4589-bfdc-ec8629e934e1\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " clearInterval(timer);\n", + " }\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " clearInterval(timer);\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1004" + } + }, + "output_type": "display_data" + } + ], "source": [ "p = figure(plot_width=400, plot_height=400, title=\"PCA of Train data\")\n", "p.circle(z_tr[y_tr==0, 0], z_tr[y_tr==0, 1], line_color=\"orange\", fill_color=\"orange\")\n", @@ -693,9 +1997,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"f57e77d2-50b6-4fe8-9b68-73bc176d8aa4\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1130\",\"type\":\"LinearAxis\"}],\"left\":[{\"id\":\"1135\",\"type\":\"LinearAxis\"}],\"plot_height\":400,\"plot_width\":400,\"renderers\":[{\"id\":\"1130\",\"type\":\"LinearAxis\"},{\"id\":\"1134\",\"type\":\"Grid\"},{\"id\":\"1135\",\"type\":\"LinearAxis\"},{\"id\":\"1139\",\"type\":\"Grid\"},{\"id\":\"1148\",\"type\":\"BoxAnnotation\"},{\"id\":\"1158\",\"type\":\"GlyphRenderer\"},{\"id\":\"1163\",\"type\":\"GlyphRenderer\"}],\"title\":{\"id\":\"1119\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1146\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"1122\",\"type\":\"DataRange1d\"},\"x_scale\":{\"id\":\"1126\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"1124\",\"type\":\"DataRange1d\"},\"y_scale\":{\"id\":\"1128\",\"type\":\"LinearScale\"}},\"id\":\"1120\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1126\",\"type\":\"LinearScale\"},{\"attributes\":{\"source\":{\"id\":\"1155\",\"type\":\"ColumnDataSource\"}},\"id\":\"1159\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1181\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1128\",\"type\":\"LinearScale\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"IurCsU7yN8CdLu7HIaI7wKqB4dmB8inALZtzfTArOcAjuG5zn/s1wIuSWeYBRS3ArB2Npd4uPcDx14+AtxMowEN3NvPDKDbARI/svWD5GsCn0SpexPI5wD9v02O5nzjAFH+LW7TxKsCwLI6AJBQ2wIB7KbWDFz/AzwBzQlmcNcCbDHrxHbs5wHXq04W4TTjAg91C5ZcsuD/SEmYIVFo3wJqNhc061zTAfHZYEjbTN8B9i9QawY04wAG2Nuq6Lj3ApD83RnLkPsBwgEoceeAiwDbIEb3cYTvA0ykBw9KSN8C8itoviAQOwNiPvqMqczXASEapUyE4QMBgt0n8o7cowCh6n1DOkTHAtjl0S6TANMAPjZy7ixg7wNFfHVf34TPANy16NwS0NcAQxoNt3l4nwCPEAbWmnPu/TnOlAd9Z/b+WFE2KrjkwwFAkzbh3kD7AKI4gmMlQMsBgCN8o/xAwwHAW7OGIEBzAsSBFvM7sPcCIY64wzqMtwLAru1I0BCNAZY89ltNpAsDgkxBsB2MuwJra3owgizfA9sMBac9/PkBfADZ4NyE9wDo/z0NpRzjA8C9D0NovOcAFoSfkfvhBwClHFvDuDzDAJH3857tjMcCIKliPocoCQJMunEX7iSnAdiOoLN/+EsBKX1IR7toHQJGsRuByIzHA1P0dIOmrIsAt7GMG8oM1wBbwVZI57DnABHYnZ7eYJcC3kzYfz7IrwOBe+MUAfTTAlHFFOxlAPMB8rCeB/i01wAyclbWUkzLAwSfVDHegREA4+1klqCguwJzOZSahYjvAL7+ZKHDMM8ByoK0LUkYVwAKdaz96LDTAvxOmzT7+HsA7ydJTmOw1wEuFdl2sOCfA/3QV2DgkMsDZrSwfl9A3wOZyJXEaRTrAI/z5uMo6JEAtDRKbi6IywCRJ70sH8QNAUMjHtC/uOsDqEpjqRDIVwPPHo27M7TPAkkr0le7sL8A=\",\"dtype\":\"float64\",\"shape\":[91]},\"y\":{\"__ndarray__\":\"XlkoU+I7IUAip2MwfhEvwEejX4Lb7ynAClHGpwYSSkC9PrXPLT4YQK7ADd8okzZAWI4Wre2AHkALIDoUNEgYQLa3tFyuZCVA/NavUkQLNMAWhp1tW/wiQCmiSpcACyXAnAlo1erxJcBoPGuI6I8ywE1e677hGzLAS5rAV21Yjj/hdP8CuN9BQD/n4H7lJzbApYSa4iGKRsBiBEZ/xwQzwDAYSxEzmyrAg7K/8WLtLMBl3WzrRHQqwMhKtmkwKzBACnH5qeNk+T+NrlLQHSY7wKmHsp3aMDtAYAgAyPNrMkChs78VvplAwMXoZcs0twJAnc8Fn16vK0DtTeg6do00wJGMVVu3PzXA5R9L2E1kN8AYajTDRmUMwJK6ufvX9B3AwP3rG1TXK0D+I3CzpRVCwDKRAJk9ah/AvzQtPsUrhj9uOyKWVwwhwM29PErcOCpAas//BuV0TUDNV16ejPMtwKYf4Ar+CTbABwWNTOqaGUCE/YB3UhgvwDB+HSu8ry/AIuhMYiHeIUCjkw+uqhYhwLsVuP0VLDHA/P/bcs8YM0B1f1twm1YWwOiCJENx6jPAtahTRuBqK8B5vSGQtNQqwGRrtrWIFSHAB+3SLs29KsDDbsucf7cswPfgza8mJCjArKXztWoAQEDQltyMgrIUQEjjEADIEzfAZbMMZav0OcDNOm95mhwswGBfH61wazLA0lBCumSOJsD7nI8CGpodQO5DYrccdytAroVbpG7gJEAhajwd9FIDQFNJCkk1GCfANEyrQX0RI8BTnRQZDCEWQC35iyEuu0BAtkY4sthtNUDWsMwjYqJAwOtxGyAYXxrADDP7WzqS979Hz1GC+7HJvxcFKZK0uSjA8WFwNIJSO0C+LOu0eD4lwJVJhohB1zJAYJYeKbM1N8DOHrikG5I2wOzpr25+rTDAzBVdoWSwBsCJ1q0ySyYpwDrm4ym+ryNAdZmXJGPaJkA=\",\"dtype\":\"float64\",\"shape\":[91]}},\"selected\":{\"id\":\"1182\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1183\",\"type\":\"UnionRenderers\"}},\"id\":\"1160\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"axis_label\":\"PC1 (8.19%)\",\"formatter\":{\"id\":\"1176\",\"type\":\"BasicTickFormatter\"},\"plot\":{\"id\":\"1120\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1131\",\"type\":\"BasicTicker\"}},\"id\":\"1130\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1131\",\"type\":\"BasicTicker\"},{\"attributes\":{\"fill_color\":{\"value\":\"blue\"},\"line_color\":{\"value\":\"blue\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1161\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1182\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1183\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"plot\":{\"id\":\"1120\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1131\",\"type\":\"BasicTicker\"}},\"id\":\"1134\",\"type\":\"Grid\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1162\",\"type\":\"Circle\"},{\"attributes\":{\"axis_label\":\"PC2 (4.34%)\",\"formatter\":{\"id\":\"1178\",\"type\":\"BasicTickFormatter\"},\"plot\":{\"id\":\"1120\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1136\",\"type\":\"BasicTicker\"}},\"id\":\"1135\",\"type\":\"LinearAxis\"},{\"attributes\":{\"data_source\":{\"id\":\"1160\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1161\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1162\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"1164\",\"type\":\"CDSView\"}},\"id\":\"1163\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1136\",\"type\":\"BasicTicker\"},{\"attributes\":{\"source\":{\"id\":\"1160\",\"type\":\"ColumnDataSource\"}},\"id\":\"1164\",\"type\":\"CDSView\"},{\"attributes\":{\"dimension\":1,\"plot\":{\"id\":\"1120\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1136\",\"type\":\"BasicTicker\"}},\"id\":\"1139\",\"type\":\"Grid\"},{\"attributes\":{},\"id\":\"1176\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"data_source\":{\"id\":\"1155\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1156\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1157\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"1159\",\"type\":\"CDSView\"}},\"id\":\"1158\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1178\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1157\",\"type\":\"Circle\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1140\",\"type\":\"PanTool\"},{\"id\":\"1141\",\"type\":\"WheelZoomTool\"},{\"id\":\"1142\",\"type\":\"BoxZoomTool\"},{\"id\":\"1143\",\"type\":\"SaveTool\"},{\"id\":\"1144\",\"type\":\"ResetTool\"},{\"id\":\"1145\",\"type\":\"HelpTool\"}]},\"id\":\"1146\",\"type\":\"Toolbar\"},{\"attributes\":{\"callback\":null},\"id\":\"1122\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1140\",\"type\":\"PanTool\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"1X9C4JTUFsAPMQZdMXtMQEpAIGo+z0dAcpvnhkTmKMB/LnH9hfAvQISwi4ZRfeQ/xP5nDbXJRUCF3SBUeoQzQKHPFkfyPkdAlIay/JnpUUD5goOERC9SQKbbtQt0GfW/jwWzTntZJEDpbAKvtBVJQOh6DKTIIlNAl1KdzTFbQkDtQFDvngFMQOe8CKR7gUJAHeD56j+NPkBidvLaoYstwJ8kPOtnvxtAdz/h7tjzFEA4qAX1hXFOQKby/k4sDk1As4MDCMldOEDmu6DUxFA0QOHh5wvu9VFA0NVg9y4xN0CDBAp6Gkc4QKFHQsaY3EFAlrcAsmXUSUBqCHHahFE5QGKqumJBkSHAnXlL1nTsDkBSRmeCHag1QOPe5ly720JA1rP7uaLgREBWZFck068tQHcOO0ATFlRAwY25Hb7+SEA2G16SRofxvylcl8EucSNAzuxD5JNKUECtKauKvAxQQHf42lRFIlNA\",\"dtype\":\"float64\",\"shape\":[45]},\"y\":{\"__ndarray__\":\"21TAGVIYNEAxN/XrrFw2wCLo2kvHBgHA+QbwSvt/NMBLDKu2xZIJwEFBMUbrxRZAmWgzf1KfIcCgk+GrdME3wA0+1TaJphhAo7TQ8QLXz78lfn3M2uw9wHt9IYgNc0JAQ+4mGIhNL0C3KifR9vEoQJMUaUZIn0PArE8xUjVBNkB7gnvaefk1wE4TaCGepChA9t9o5q20F8CLsXIB8IEowD0ilrTKByRAt/WGBsnEQsAJmrBHoulFQOS2fxWGVzTA9VFhU3BDFkBkvugxVO1BQD/2w55MqgrA84qLxKXaTUAXHRzTiDQ8QM8pTBhtbjNAxeY1wUCaNUBDNiCG7hFGQNvt2ZamKUNAPmN4hXMpSUDCAG3k8AI1QLE2qtErjyxAKZ1OUt0xRUCn3Rl4s5lDQNzKy+j5hxVALaPZhE4LQMC39eFimkk/QKAUzSSpIChAqHpuETg0OcDggg/wbaY2wHOxrRFm/hjA\",\"dtype\":\"float64\",\"shape\":[45]}},\"selected\":{\"id\":\"1180\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1181\",\"type\":\"UnionRenderers\"}},\"id\":\"1155\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1180\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1141\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"fill_color\":{\"value\":\"orange\"},\"line_color\":{\"value\":\"orange\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1156\",\"type\":\"Circle\"},{\"attributes\":{\"overlay\":{\"id\":\"1148\",\"type\":\"BoxAnnotation\"}},\"id\":\"1142\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1143\",\"type\":\"SaveTool\"},{\"attributes\":{\"plot\":null,\"text\":\"PCA of Train data\"},\"id\":\"1119\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1144\",\"type\":\"ResetTool\"},{\"attributes\":{\"callback\":null},\"id\":\"1124\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1145\",\"type\":\"HelpTool\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"plot\":null,\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1148\",\"type\":\"BoxAnnotation\"}],\"root_ids\":[\"1120\"]},\"title\":\"Bokeh Application\",\"version\":\"1.0.4\"}};\n", + " var render_items = [{\"docid\":\"f57e77d2-50b6-4fe8-9b68-73bc176d8aa4\",\"roots\":{\"1120\":\"f8d517b1-7536-41c2-bd90-e501e6257548\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " clearInterval(timer);\n", + " }\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " clearInterval(timer);\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1120" + } + }, + "output_type": "display_data" + } + ], "source": [ "vars = pca.explained_variance_ratio_\n", "p = figure(plot_width=400, plot_height=400, title=\"PCA of Train data\")\n", @@ -718,11 +2076,76 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function embed_document(root) {\n", + " \n", + " var docs_json = {\"a711a3da-5605-4548-b1d5-3dad3feddda6\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1694\",\"type\":\"LinearAxis\"}],\"left\":[{\"id\":\"1699\",\"type\":\"LinearAxis\"}],\"plot_height\":400,\"plot_width\":400,\"renderers\":[{\"id\":\"1694\",\"type\":\"LinearAxis\"},{\"id\":\"1698\",\"type\":\"Grid\"},{\"id\":\"1699\",\"type\":\"LinearAxis\"},{\"id\":\"1703\",\"type\":\"Grid\"},{\"id\":\"1712\",\"type\":\"BoxAnnotation\"},{\"id\":\"1722\",\"type\":\"GlyphRenderer\"},{\"id\":\"1727\",\"type\":\"GlyphRenderer\"}],\"title\":{\"id\":\"1683\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1710\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"1686\",\"type\":\"DataRange1d\"},\"x_scale\":{\"id\":\"1690\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"1688\",\"type\":\"DataRange1d\"},\"y_scale\":{\"id\":\"1692\",\"type\":\"LinearScale\"}},\"id\":\"1684\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1780\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"plot\":null,\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1712\",\"type\":\"BoxAnnotation\"},{\"attributes\":{},\"id\":\"1782\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1708\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1704\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1784\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1707\",\"type\":\"SaveTool\"},{\"attributes\":{\"source\":{\"id\":\"1724\",\"type\":\"ColumnDataSource\"}},\"id\":\"1728\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1785\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1721\",\"type\":\"Circle\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"gvEGsejdMEDZEmf8C/oSwN2SQ4VulUPApLHbdbaHC8AfAQ85LOoswAm2axj5tjvAFRnR0a/ENMCOrdDhNKQmwHiFaCl9xjnAvn3i28XxPMAhgbA8VE4zwIi3jrSgb0PAB4whJiYMOcA8oFXfbMNCwJamZifZPjHA2Bk2DE+FN8CKB/fWBsBCwAXcEmgKzjzAVW+mC7AiPMANn/CbPE09wMLtZeIYqjbAhRmr4yomQMBQYo3T04o7wC1a+MUtE/S/ySnjGc4VGUDpcqCY41glwEkf6/v+2z7AFAKh3UnjNMB3pDRfrs81wELjtKeBEDTA+6I3/hCMQMCekojVpmExwBwYdaUbXzHAQa6rzMnBMsBMNNisTY1BwN9QmRa/9BPA8Ss8w2raJcBKrXMfc+0NwOD1RTgkoDXAXT5Atg+CNcAk4p8ebSYlwA1HshmK2hZAYxNekO0xNcA+eXCbc580wPvk9MujFzHAYW+a82WRIcDs6YEy4Ro6wK9bHHbK8QrAcfUWejhSF8Aq3BYDC7IWwAnr8PYDKCbAynCD+gvxM8Bz34Ok/B44wAb4uezujuE/8pP4o7kRIcCUQ3BOEoUewIUphsoiiDjALHfDo+5YNsBdoY07OIAiQB//TAWfNhTAA7D7iA24IMBVz/cB9SdDwDERfxQJEEHADKZHiGDQFcByVV5KtTk7wMCyLlsjCDfAqKJYjJsxI0D++ayQUyE2wGyJTJEu+zHAyHj3KHDIPUCrWyQ700svwHuc5OIlZDzAYmyf6uzfKEDQdyxK6/EGQNMeAPQi6gLA+wd68PljNsA0Iy/KhqIrwM6iwJByICjABQdYq7JOMsB7O+t+BrcywE0pOVc5yzrAzBNxk/guKsBgrHG2VwonwPupzOLvHzXAlvs6/NCKEsAUtUGf4KspwFj3jbHc6zDASFd95pIFJUALIejyrUwzwEGor6YZlzbA\",\"dtype\":\"float64\",\"shape\":[90]},\"y\":{\"__ndarray__\":\"+Vx517bmQMAwkSni7ic6wLjbjhEP60RAQMk9PATYJ8Ag9KYTLyLlv7+p9BLacjNAZ9/bd9RT8T93zDOJvOZBwNjOOuTacfS/seLk5puENECrwwVSsZ/4v+E5odnNc0JAPKeONaarDECIkzcHu49HQJqETSbNvTTAWn64UCYM9z/yTSVa+VpJQNhbc97Jqz5AOA96A3iF7L9eU9ivd3QRQBpsjskPYTzARXIbF9m3LcB3HzZQlqwwQCsSAN4fh0TAMsGXkFZZN8CEfFKEbW5CwLU4rrYTQCrAA2re83DZI0CqteRI7kAwwAdbBeWbbjXAQipj8H96OcCLjmF2gw4lwMUB0zYcqDzANKT/X5E4QMC2VD6RBY8qwJ1WId3vmTLAGoqaD68DOcB3KjRfdH8OwMtFs8jU2gVAOoBVdVhCI8CsyKluL3w1wJvKcluBfUTA6xz5HSASOMBul/7itDQ6QKoyTPlfJzjASMNoT4fR2r92yXNewU4uQJ0EoJO6iyvAEfljR3OHN8DjRpmwQqNBwKvH8J80FCPACWmxJqNr9T9jG2NQdisMwA1UoGcBMjTA6LesxDwaKUAteRL3Q/ElwElqzkhy0SzAZT7jyphUPsC9gzIEYoo1wH/ZC1ePiy7ApOda/zIbOcCGjQa1RcQzQEbSH8heWOU/zFnPC291QsBJMtyC52wLwH/KH+2wNivAYW7TknqMMMBRgd6lwLMewIMZRjYjrx/AypxNhrxgRMB/o4Vwq1guwLLmHu7KAfg/go67lX9hNMDtncbA4tgwQMyatAMRK0xAmPBMIjetIEBC+rYhAFlAwHUJbSdczUHA193ORg/8JcA1e7slFuUNwJKm7mJTW0RAqCiO02woGcC8Upwe2isnwObcc1VYn+W/NtyF6SFzKMCP9N8kExIiQDw75fYUEPo/zt4gqOpEQMCiP1lOpd8kQFBhlQMsMj9A\",\"dtype\":\"float64\",\"shape\":[90]}},\"selected\":{\"id\":\"1786\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1787\",\"type\":\"UnionRenderers\"}},\"id\":\"1724\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"data_source\":{\"id\":\"1719\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1720\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1721\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"1723\",\"type\":\"CDSView\"}},\"id\":\"1722\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1786\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1705\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1724\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1725\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1726\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"1728\",\"type\":\"CDSView\"}},\"id\":\"1727\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1700\",\"type\":\"BasicTicker\"},{\"attributes\":{\"fill_color\":{\"value\":\"blue\"},\"line_color\":{\"value\":\"blue\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1725\",\"type\":\"Circle\"},{\"attributes\":{\"axis_label\":\"PC2 (5.10%)\",\"formatter\":{\"id\":\"1782\",\"type\":\"BasicTickFormatter\"},\"plot\":{\"id\":\"1684\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1700\",\"type\":\"BasicTicker\"}},\"id\":\"1699\",\"type\":\"LinearAxis\"},{\"attributes\":{\"callback\":null},\"id\":\"1688\",\"type\":\"DataRange1d\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1704\",\"type\":\"PanTool\"},{\"id\":\"1705\",\"type\":\"WheelZoomTool\"},{\"id\":\"1706\",\"type\":\"BoxZoomTool\"},{\"id\":\"1707\",\"type\":\"SaveTool\"},{\"id\":\"1708\",\"type\":\"ResetTool\"},{\"id\":\"1709\",\"type\":\"HelpTool\"}]},\"id\":\"1710\",\"type\":\"Toolbar\"},{\"attributes\":{\"overlay\":{\"id\":\"1712\",\"type\":\"BoxAnnotation\"}},\"id\":\"1706\",\"type\":\"BoxZoomTool\"},{\"attributes\":{},\"id\":\"1787\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"callback\":null},\"id\":\"1686\",\"type\":\"DataRange1d\"},{\"attributes\":{\"plot\":null,\"text\":\"PCA of Test data\"},\"id\":\"1683\",\"type\":\"Title\"},{\"attributes\":{\"fill_color\":{\"value\":\"orange\"},\"line_color\":{\"value\":\"orange\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1720\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1692\",\"type\":\"LinearScale\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"RCNQWuwKMkA+BPua+ntMQCHbl6qyX0lAH0tP/DhtDEATqyv8BnUuwFik0gflex1A+WpeBVveLEB0dct5x3tDQEDPxuVkRRhAvqfzdvzkNkCQ9hVpP00vQLsOUK64M/4/VxQ+NnbYPUC796RIo11HQHWslQq34ElAGA5y8nciPsA0VyX+rYY4QEkOPuW2G0FAmT2M+tBwUEDlv85UAnpRQB4DF/ZSTjRAzGoXraIJUUBL3uBgkNNIQKT2quSkYD5ArfkH++auLECdOHyaNjxFQMhvlX2VKExAXa97QAXmSUB8dEMaZSdQQFwsFteQUjhA8CVBIoJnPUDO20sgO1wzQPaem7CvcfA/qwlRP87LVEDXXgPAGEs5QBrN5DJXlC/AAH3D5CUZQEC1ZnQhE9ktQGBdCTn+0DNAB1cAbyifQUBRsnD9nHM0QAtQtu2/LU1AJVI75Pj9UkBC6E8/CA8yQIAiObiscjlAR7MXCqRTSUA=\",\"dtype\":\"float64\",\"shape\":[46]},\"y\":{\"__ndarray__\":\"4zDKEjvOF8Ah5QFrihQjwMEKJ1dL9y7Am3uqnV8UGkCX/4KjULE9QFCbAUwNNzZA7sVnucLkOUAmDeCJDCs9QEPNEV8xJRnAq8i2dQSXQ0C6CzEj6VY+QNs8pZdoiRdAQWGAxY2OJECMhDDWLAc6QN/+DbMd7RDAx/zhGqlkQkAOfltd0DUYwERyrzaeKx/As4rw7XkUDUD6ZgN0OQ/fv2cOBDuyNDlAf4O5/TspJsCi/Rz7fIkIQHznzI8cYzhA7QDMfOMyMUBnqXZg2qMZwC1mrW+4mCfAX9rW4Os+F0DIvCrM+sggwC+te5KzmAtAIUiQ6rPpI0BrmJd0wjpPQH54JbPUeFNAcjVsAsAF1L8u4PMCwQY2wET1nLNUV09AXZx4dG9mLUDbmEMbdU4qQFoO//aLpztArNQQngA0KkBdR/qJ/FUwQP4i569fURLAAfZtWTfb7j/4bLigJ4hGQLPuCjo30ztAGHhcSdkrEEA=\",\"dtype\":\"float64\",\"shape\":[46]}},\"selected\":{\"id\":\"1784\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1785\",\"type\":\"UnionRenderers\"}},\"id\":\"1719\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1709\",\"type\":\"HelpTool\"},{\"attributes\":{},\"id\":\"1690\",\"type\":\"LinearScale\"},{\"attributes\":{\"source\":{\"id\":\"1719\",\"type\":\"ColumnDataSource\"}},\"id\":\"1723\",\"type\":\"CDSView\"},{\"attributes\":{\"axis_label\":\"PC1 (7.44%)\",\"formatter\":{\"id\":\"1780\",\"type\":\"BasicTickFormatter\"},\"plot\":{\"id\":\"1684\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1695\",\"type\":\"BasicTicker\"}},\"id\":\"1694\",\"type\":\"LinearAxis\"},{\"attributes\":{},\"id\":\"1695\",\"type\":\"BasicTicker\"},{\"attributes\":{\"plot\":{\"id\":\"1684\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1695\",\"type\":\"BasicTicker\"}},\"id\":\"1698\",\"type\":\"Grid\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1726\",\"type\":\"Circle\"},{\"attributes\":{\"dimension\":1,\"plot\":{\"id\":\"1684\",\"subtype\":\"Figure\",\"type\":\"Plot\"},\"ticker\":{\"id\":\"1700\",\"type\":\"BasicTicker\"}},\"id\":\"1703\",\"type\":\"Grid\"}],\"root_ids\":[\"1684\"]},\"title\":\"Bokeh Application\",\"version\":\"1.0.4\"}};\n", + " var render_items = [{\"docid\":\"a711a3da-5605-4548-b1d5-3dad3feddda6\",\"roots\":{\"1684\":\"36102d0f-2c44-4093-851c-ae3bc6a4acc1\"}}];\n", + " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", + "\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " } else {\n", + " var attempts = 0;\n", + " var timer = setInterval(function(root) {\n", + " if (root.Bokeh !== undefined) {\n", + " embed_document(root);\n", + " clearInterval(timer);\n", + " }\n", + " attempts++;\n", + " if (attempts > 100) {\n", + " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", + " clearInterval(timer);\n", + " }\n", + " }, 10, root)\n", + " }\n", + "})(window);" + ], + "application/vnd.bokehjs_exec.v0+json": "" + }, + "metadata": { + "application/vnd.bokehjs_exec.v0+json": { + "id": "1684" + } + }, + "output_type": "display_data" + } + ], "source": [ - "## exercise here" + "## exercise here\n", + "z_ts = pca.fit_transform(x_ts)\n", + "pca.fit(x_ts)\n", + "z_ts = pca.transform(x_ts)\n", + "\n", + "vars = pca.explained_variance_ratio_\n", + "p = figure(plot_width=400, plot_height=400, title=\"PCA of Test data\")\n", + "p.circle(z_ts[y_ts==0, 0], z_ts[y_ts==0, 1], line_color=\"orange\", fill_color=\"orange\")\n", + "p.circle(z_ts[y_ts==1, 0], z_ts[y_ts==1, 1], line_color=\"blue\", fill_color=\"blue\")\n", + "p.xaxis.axis_label = \"PC1 (%.2f%%)\" % (100*vars[0])\n", + "p.yaxis.axis_label = \"PC2 (%.2f%%)\" % (100*vars[1])\n", + "show(p)" ] }, { @@ -761,7 +2184,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": { "colab": {}, "colab_type": "code", @@ -774,21 +2197,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "metadata": { "colab": {}, "colab_type": "code", "id": "Cg8TpDATk3XI", "outputId": "e9658389-474c-4bf5-f196-11d4518311b7" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", + " metric_params=None, n_jobs=None, n_neighbors=10, p=2,\n", + " weights='uniform')" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "knn.fit(x_tr, y_tr)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": { "colab": {}, "colab_type": "code", @@ -845,13 +2281,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": { "colab": {}, "colab_type": "code", "id": "JISD2EVQ9Q9Z" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[27, 19],\n", + " [ 0, 90]])" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import confusion_matrix\n", "conf = confusion_matrix(y_ts, y_pred_knn)\n", @@ -870,13 +2318,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": { "colab": {}, "colab_type": "code", "id": "pZVN8GKKdOhy" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "46" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "np.sum(y_ts==0) # total number of \"class 0\" samples in the test set" ] @@ -893,13 +2352,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "metadata": { "colab": {}, "colab_type": "code", "id": "1PVj7JbxdVk0" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "90" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "np.sum(y_ts==1) # total number of \"class 1\" samples in the test set" ] @@ -928,13 +2398,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "metadata": { "colab": {}, "colab_type": "code", "id": "-1-40TyQeAIt" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8602941176470589" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "(conf[0,0] + conf[1,1])/y_ts.shape[0] # y_ts.shape[0] is the sample size of the test set" ] @@ -951,13 +2432,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "metadata": { "colab": {}, "colab_type": "code", "id": "q0emRGAvfWi4" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8602941176470589\n" + ] + } + ], "source": [ "tp = conf[1,1]\n", "tn = conf[0,0]\n", @@ -984,13 +2473,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "metadata": { "colab": {}, "colab_type": "code", "id": "a9JlR-LNe5ZI" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "conf[1,1] / (conf[1,1] + conf[1,0])" ] @@ -1021,13 +2521,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "metadata": { "colab": {}, "colab_type": "code", "id": "3KeLJcCbkSo6" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8602941176470589" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import accuracy_score\n", "accuracy_score(y_ts, y_pred_knn)" @@ -1045,13 +2556,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": { "colab": {}, "colab_type": "code", "id": "MgfhssjZmsg3" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import recall_score\n", "recall_score(y_ts, y_pred_knn)" @@ -1069,13 +2591,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": { "colab": {}, "colab_type": "code", "id": "AKiUXIkPm-N3" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8602941176470589\n", + "1.0\n" + ] + } + ], "source": [ "from sklearn import metrics\n", "print(metrics.accuracy_score(y_ts, y_pred_knn))\n", @@ -1094,14 +2625,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": { "colab": {}, "colab_type": "code", "id": "HXgvIJM2k3XQ", "outputId": "0d2d0773-a292-40cb-d8e7-df6b6ee29ff2" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 0.59 0.74 46\n", + " 1 0.83 1.00 0.90 90\n", + "\n", + " micro avg 0.86 0.86 0.86 136\n", + " macro avg 0.91 0.79 0.82 136\n", + "weighted avg 0.88 0.86 0.85 136\n", + "\n" + ] + } + ], "source": [ "from sklearn import metrics\n", "print(metrics.classification_report(y_ts, y_pred_knn))" @@ -1159,14 +2706,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "metadata": { "colab": {}, "colab_type": "code", "id": "OuoRfictk3XW", "outputId": "9119acba-9d18-4076-eb3c-8346ba420579" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6961630553262051\n" + ] + } + ], "source": [ "print(metrics.matthews_corrcoef(y_ts, y_pred_knn))" ] @@ -1241,7 +2796,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "metadata": { "colab": {}, "colab_type": "code", @@ -1291,13 +2846,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "metadata": { "colab": {}, "colab_type": "code", "id": "n12boA3k3Neo" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy = 0.750\n", + "MCC = 0.538\n" + ] + } + ], "source": [ "from sklearn import metrics\n", "knn = neighbors.KNeighborsClassifier(n_neighbors=10)\n", @@ -1356,7 +2920,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": { "colab": {}, "colab_type": "code", @@ -1390,13 +2954,70 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "metadata": { "colab": {}, "colab_type": "code", "id": "-uoahY6yNcIv" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Fold 2 / 5 ###\n", + "TRAIN size: 108\n", + "-- class 0: 36 class 1: 72\n", + "TEST size: 28\n", + "-- class 0: 9 class 1: 19\n", + "\n", + "Model performance\n", + "Accuracy on TEST set: 0.929\n", + "MCC on TEST set: 0.839\n", + "\n", + "### Fold 3 / 5 ###\n", + "TRAIN size: 109\n", + "-- class 0: 36 class 1: 73\n", + "TEST size: 27\n", + "-- class 0: 9 class 1: 18\n", + "\n", + "Model performance\n", + "Accuracy on TEST set: 0.852\n", + "MCC on TEST set: 0.674\n", + "\n", + "### Fold 4 / 5 ###\n", + "TRAIN size: 109\n", + "-- class 0: 36 class 1: 73\n", + "TEST size: 27\n", + "-- class 0: 9 class 1: 18\n", + "\n", + "Model performance\n", + "Accuracy on TEST set: 0.815\n", + "MCC on TEST set: 0.567\n", + "\n", + "### Fold 5 / 5 ###\n", + "TRAIN size: 109\n", + "-- class 0: 36 class 1: 73\n", + "TEST size: 27\n", + "-- class 0: 9 class 1: 18\n", + "\n", + "Model performance\n", + "Accuracy on TEST set: 0.815\n", + "MCC on TEST set: 0.590\n", + "\n", + "### Fold 6 / 5 ###\n", + "TRAIN size: 109\n", + "-- class 0: 36 class 1: 73\n", + "TEST size: 27\n", + "-- class 0: 9 class 1: 18\n", + "\n", + "Model performance\n", + "Accuracy on TEST set: 0.926\n", + "MCC on TEST set: 0.837\n", + "\n" + ] + } + ], "source": [ "## get the number of splitting operations\n", "N = skf.get_n_splits(x_tr, y_tr)\n", @@ -1451,9 +3072,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average cross-validation accuracy: 0.867\n", + "Average cross-validation MCC: 0.701\n" + ] + } + ], "source": [ "## note: we need to convert the lists to numpy arrays before computing the means\n", "acc_avg = np.mean(np.array(acc_list))\n", @@ -1472,13 +3102,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "metadata": { "colab": {}, "colab_type": "code", "id": "-uoahY6yNcIv" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Iteration 1 ###\n", + "### Iteration 2 ###\n", + "### Iteration 3 ###\n", + "### Iteration 4 ###\n", + "### Iteration 5 ###\n", + "### Iteration 6 ###\n", + "### Iteration 7 ###\n", + "### Iteration 8 ###\n", + "### Iteration 9 ###\n", + "### Iteration 10 ###\n" + ] + } + ], "source": [ "## how many repetitions?\n", "N_CV = 10\n", @@ -1572,7 +3219,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/chierici_practical_part2.ipynb b/chierici_practical_part2.ipynb index a658b47..9106d90 100644 --- a/chierici_practical_part2.ipynb +++ b/chierici_practical_part2.ipynb @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "colab": {}, "colab_type": "code", @@ -56,9 +56,300 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " var force = true;\n", + "\n", + " if (typeof (root._bokeh_onload_callbacks) === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + " var JS_MIME_TYPE = 'application/javascript';\n", + " var HTML_MIME_TYPE = 'text/html';\n", + " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " var CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " var script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " var cell = handle.cell;\n", + "\n", + " var id = cell.output_area._bokeh_element_id;\n", + " var server_id = cell.output_area._bokeh_server_id;\n", + " // Clean up Bokeh references\n", + " if (id != null && id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " var id = msg.content.text.trim();\n", + " if (id in Bokeh.index) {\n", + " Bokeh.index[id].model.document.clear();\n", + " delete Bokeh.index[id];\n", + " }\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", + " cell.notebook.kernel.execute(cmd);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " var output_area = handle.output_area;\n", + " var output = handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " var bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " var script_attrs = bk_div.children[0].attributes;\n", + " for (var i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " var toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " var events = require('base/js/events');\n", + " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + "\n", + " \n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " var NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"
    \\n\"+\n", + " \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n", + " \"
  • use INLINE resources instead, as so:
  • \\n\"+\n", + " \"
\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded() {\n", + " var el = document.getElementById(\"1001\");\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS is loading...\";\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(display_loaded, 100)\n", + " }\n", + " }\n", + "\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", + " }\n", + " finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.info(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(js_urls, callback) {\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = js_urls.length;\n", + " for (var i = 0; i < js_urls.length; i++) {\n", + " var url = js_urls[i];\n", + " var s = document.createElement('script');\n", + " s.src = url;\n", + " s.async = false;\n", + " s.onreadystatechange = s.onload = function() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", + " run_callbacks()\n", + " }\n", + " };\n", + " s.onerror = function() {\n", + " console.warn(\"failed to load library \" + url);\n", + " };\n", + " console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", + " }\n", + " };var element = document.getElementById(\"1001\");\n", + " if (element == null) {\n", + " console.log(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. \")\n", + " return false;\n", + " }\n", + "\n", + " var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.0.4.min.js\"];\n", + "\n", + " var inline_js = [\n", + " function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + " \n", + " function(Bokeh) {\n", + " \n", + " },\n", + " function(Bokeh) {\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.css\");\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.css\");\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.css\");\n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " \n", + " if ((root.Bokeh !== undefined) || (force === true)) {\n", + " for (var i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }if (force === true) {\n", + " display_loaded();\n", + " }} else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " root._bokeh_failed_load = true;\n", + " } else if (force !== true) {\n", + " var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + "\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(js_urls, function() {\n", + " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], + "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof (root._bokeh_onload_callbacks) === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"1001\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n }\n finally {\n delete root._bokeh_onload_callbacks\n }\n console.info(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(js_urls, callback) {\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = js_urls.length;\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var s = document.createElement('script');\n s.src = url;\n s.async = false;\n s.onreadystatechange = s.onload = function() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.log(\"Bokeh: all BokehJS libraries loaded\");\n run_callbacks()\n }\n };\n s.onerror = function() {\n console.warn(\"failed to load library \" + url);\n };\n console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.getElementsByTagName(\"head\")[0].appendChild(s);\n }\n };var element = document.getElementById(\"1001\");\n if (element == null) {\n console.log(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. \")\n return false;\n }\n\n var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.0.4.min.js\"];\n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n \n function(Bokeh) {\n \n },\n function(Bokeh) {\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-1.0.4.min.css\");\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.0.4.min.css\");\n console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.css\");\n Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.0.4.min.css\");\n }\n ];\n\n function run_inline_js() {\n \n if ((root.Bokeh !== undefined) || (force === true)) {\n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(js_urls, function() {\n console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "output_notebook()" ] @@ -75,7 +366,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -85,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "colab": {}, "colab_type": "code", @@ -111,16 +402,246 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "colab": {}, "colab_type": "code", "id": "SBBKnUO-LKKS" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sampleIDALB.Gene_AceViewCD24L4.1.Gene_AceViewRPS11.Gene_RefSeqRPS18.Gene_AceViewC5orf13.Gene_AceViewCCT2.Gene_AceViewCOL1A1.Gene_AceViewDDX1.Gene_AceViewEEF1A1.Gene_AceView...zawskaw.Gene_AceViewzeedor.Gene_AceViewzergor.Gene_AceViewzorsa.Gene_AceViewzoychabu.Gene_AceViewzoysteeby.Gene_AceViewzudee.Gene_AceViewzureyby.Gene_AceViewzuswoybu.Gene_AceViewzyjee.Gene_AceView
0SEQC_NB0019.2918.8221.1720.9020.0216.3118.6015.7321.71...0.00.000.00.000.00.00.000.00.00.0
1SEQC_NB0039.2520.2522.4422.0021.0517.0619.3922.8422.72...0.05.540.03.390.00.05.450.00.00.0
2SEQC_NB0058.9920.0922.0921.7121.6516.8523.0215.7922.24...0.00.000.03.750.00.00.000.00.00.0
3SEQC_NB0117.3219.8220.5220.9021.5816.4918.9115.4522.06...0.00.000.00.000.00.00.000.00.00.0
4SEQC_NB01310.5621.1920.6921.2920.2816.2217.1516.0121.84...0.00.000.05.200.00.00.000.00.00.0
\n", + "

5 rows × 52230 columns

\n", + "
" + ], + "text/plain": [ + " sampleID ALB.Gene_AceView CD24L4.1.Gene_AceView RPS11.Gene_RefSeq \\\n", + "0 SEQC_NB001 9.29 18.82 21.17 \n", + "1 SEQC_NB003 9.25 20.25 22.44 \n", + "2 SEQC_NB005 8.99 20.09 22.09 \n", + "3 SEQC_NB011 7.32 19.82 20.52 \n", + "4 SEQC_NB013 10.56 21.19 20.69 \n", + "\n", + " RPS18.Gene_AceView C5orf13.Gene_AceView CCT2.Gene_AceView \\\n", + "0 20.90 20.02 16.31 \n", + "1 22.00 21.05 17.06 \n", + "2 21.71 21.65 16.85 \n", + "3 20.90 21.58 16.49 \n", + "4 21.29 20.28 16.22 \n", + "\n", + " COL1A1.Gene_AceView DDX1.Gene_AceView EEF1A1.Gene_AceView ... \\\n", + "0 18.60 15.73 21.71 ... \n", + "1 19.39 22.84 22.72 ... \n", + "2 23.02 15.79 22.24 ... \n", + "3 18.91 15.45 22.06 ... \n", + "4 17.15 16.01 21.84 ... \n", + "\n", + " zawskaw.Gene_AceView zeedor.Gene_AceView zergor.Gene_AceView \\\n", + "0 0.0 0.00 0.0 \n", + "1 0.0 5.54 0.0 \n", + "2 0.0 0.00 0.0 \n", + "3 0.0 0.00 0.0 \n", + "4 0.0 0.00 0.0 \n", + "\n", + " zorsa.Gene_AceView zoychabu.Gene_AceView zoysteeby.Gene_AceView \\\n", + "0 0.00 0.0 0.0 \n", + "1 3.39 0.0 0.0 \n", + "2 3.75 0.0 0.0 \n", + "3 0.00 0.0 0.0 \n", + "4 5.20 0.0 0.0 \n", + "\n", + " zudee.Gene_AceView zureyby.Gene_AceView zuswoybu.Gene_AceView \\\n", + "0 0.00 0.0 0.0 \n", + "1 5.45 0.0 0.0 \n", + "2 0.00 0.0 0.0 \n", + "3 0.00 0.0 0.0 \n", + "4 0.00 0.0 0.0 \n", + "\n", + " zyjee.Gene_AceView \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "\n", + "[5 rows x 52230 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data_tr = pd.read_csv(DATA_TR, sep = \"\\t\")\n", - "data_ts = pd.read_csv(DATA_TS, sep = \"\\t\")" + "data_ts = pd.read_csv(DATA_TS, sep = \"\\t\")\n", + "\n", + "data_tr.head()" ] }, { @@ -145,11 +666,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "colab": {}, "colab_type": "code", - "id": "-Ejm4N6xLKKW" + "id": "-Ejm4N6xLKKW", + "scrolled": true }, "outputs": [], "source": [ @@ -157,6 +679,243 @@ "data_ts = data_ts.drop('sampleID',axis =1)" ] }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ALB.Gene_AceViewCD24L4.1.Gene_AceViewRPS11.Gene_RefSeqRPS18.Gene_AceViewC5orf13.Gene_AceViewCCT2.Gene_AceViewCOL1A1.Gene_AceViewDDX1.Gene_AceViewEEF1A1.Gene_AceViewFLT3LG_.Gene_AceView...zawskaw.Gene_AceViewzeedor.Gene_AceViewzergor.Gene_AceViewzorsa.Gene_AceViewzoychabu.Gene_AceViewzoysteeby.Gene_AceViewzudee.Gene_AceViewzureyby.Gene_AceViewzuswoybu.Gene_AceViewzyjee.Gene_AceView
09.2918.8221.1720.9020.0216.3118.6015.7321.7120.02...0.00.000.00.000.00.00.000.00.00.0
19.2520.2522.4422.0021.0517.0619.3922.8422.7221.26...0.05.540.03.390.00.05.450.00.00.0
28.9920.0922.0921.7121.6516.8523.0215.7922.2420.75...0.00.000.03.750.00.00.000.00.00.0
37.3219.8220.5220.9021.5816.4918.9115.4522.0619.59...0.00.000.00.000.00.00.000.00.00.0
410.5621.1920.6921.2920.2816.2217.1516.0121.8419.74...0.00.000.05.200.00.00.000.00.00.0
\n", + "

5 rows × 52229 columns

\n", + "
" + ], + "text/plain": [ + " ALB.Gene_AceView CD24L4.1.Gene_AceView RPS11.Gene_RefSeq \\\n", + "0 9.29 18.82 21.17 \n", + "1 9.25 20.25 22.44 \n", + "2 8.99 20.09 22.09 \n", + "3 7.32 19.82 20.52 \n", + "4 10.56 21.19 20.69 \n", + "\n", + " RPS18.Gene_AceView C5orf13.Gene_AceView CCT2.Gene_AceView \\\n", + "0 20.90 20.02 16.31 \n", + "1 22.00 21.05 17.06 \n", + "2 21.71 21.65 16.85 \n", + "3 20.90 21.58 16.49 \n", + "4 21.29 20.28 16.22 \n", + "\n", + " COL1A1.Gene_AceView DDX1.Gene_AceView EEF1A1.Gene_AceView \\\n", + "0 18.60 15.73 21.71 \n", + "1 19.39 22.84 22.72 \n", + "2 23.02 15.79 22.24 \n", + "3 18.91 15.45 22.06 \n", + "4 17.15 16.01 21.84 \n", + "\n", + " FLT3LG_.Gene_AceView ... zawskaw.Gene_AceView zeedor.Gene_AceView \\\n", + "0 20.02 ... 0.0 0.00 \n", + "1 21.26 ... 0.0 5.54 \n", + "2 20.75 ... 0.0 0.00 \n", + "3 19.59 ... 0.0 0.00 \n", + "4 19.74 ... 0.0 0.00 \n", + "\n", + " zergor.Gene_AceView zorsa.Gene_AceView zoychabu.Gene_AceView \\\n", + "0 0.0 0.00 0.0 \n", + "1 0.0 3.39 0.0 \n", + "2 0.0 3.75 0.0 \n", + "3 0.0 0.00 0.0 \n", + "4 0.0 5.20 0.0 \n", + "\n", + " zoysteeby.Gene_AceView zudee.Gene_AceView zureyby.Gene_AceView \\\n", + "0 0.0 0.00 0.0 \n", + "1 0.0 5.45 0.0 \n", + "2 0.0 0.00 0.0 \n", + "3 0.0 0.00 0.0 \n", + "4 0.0 0.00 0.0 \n", + "\n", + " zuswoybu.Gene_AceView zyjee.Gene_AceView \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + "[5 rows x 52229 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_tr.head()" + ] + }, { "cell_type": "markdown", "metadata": { @@ -169,7 +928,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "colab": {}, "colab_type": "code", @@ -193,7 +952,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "colab": {}, "colab_type": "code", @@ -231,7 +990,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -246,7 +1005,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "colab": {}, "colab_type": "code", @@ -281,14 +1040,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "colab": {}, "colab_type": "code", "id": "Qqc3TmFBLKKn", "outputId": "d9ef6c64-9f18-4bea-9167-decaa0ca1820" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LibSVM]" + ] + } + ], "source": [ "## fit the model and get the predictions\n", "svc.fit(x_tr, y_tr)\n", @@ -307,14 +1074,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "colab": {}, "colab_type": "code", "id": "Ku0JSF_ALKKs", "outputId": "94585c0e-534a-445d-d0ba-92a9bf3a9388" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556\n" + ] + } + ], "source": [ "from sklearn import metrics\n", "print('MCC = ', metrics.matthews_corrcoef(class_lab_ts, class_pred_ts))\n", @@ -334,14 +1111,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "colab": {}, "colab_type": "code", "id": "whSZnHGALKKx", "outputId": "2c471734-3504-4af7-8ebb-74e5a02be301" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.91 0.93 0.92 46\n", + " 1 0.97 0.96 0.96 90\n", + "\n", + " micro avg 0.95 0.95 0.95 136\n", + " macro avg 0.94 0.95 0.94 136\n", + "weighted avg 0.95 0.95 0.95 136\n", + "\n" + ] + } + ], "source": [ "print(metrics.classification_report(class_lab_ts, class_pred_ts))" ] @@ -358,15 +1151,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "colab": {}, "colab_type": "code", "id": "ZT6XjB20LKK0" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MCC = 0.8357487922705314\n", + "ACC = 0.9264705882352942\n", + "SENS = 0.9444444444444444\n" + ] + } + ], "source": [ - "## space for exercise\n" + "## space for exercise\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "clf = RandomForestClassifier(150)\n", + "clf.fit(x_tr, y_tr)\n", + "y_ts = clf.predict(x_ts)\n", + "\n", + "print('MCC = ', metrics.matthews_corrcoef(class_lab_ts, y_ts))\n", + "print('ACC = ', metrics.accuracy_score(class_lab_ts, y_ts))\n", + "print('SENS = ', metrics.recall_score(class_lab_ts, y_ts))\n" ] }, { @@ -393,7 +1205,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "colab": {}, "colab_type": "code", @@ -401,7 +1213,58 @@ "outputId": "099e6404-c7fd-414a-b49a-4092af095c57", "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C = 1e-06\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/davidyue/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:543: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MCC = 0.0\n", + "ACC = 0.6617647058823529\n", + "SENS = 1.0 \n", + "\n", + "C = 1e-05\n", + "MCC = 0.6310547428675068\n", + "ACC = 0.8308823529411765\n", + "SENS = 1.0 \n", + "\n", + "C = 0.0001\n", + "MCC = 0.9014492753623189\n", + "ACC = 0.9558823529411765\n", + "SENS = 0.9666666666666667 \n", + "\n", + "C = 0.001\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.01\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.1\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n" + ] + } + ], "source": [ "## define the sequence of C values we want to use in the search of the best one\n", "C_list = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]\n", @@ -437,15 +1300,209 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "colab": {}, "colab_type": "code", "id": "BPtC-EBSLKK_" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C = 1e-06 gamma = 0.001\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/davidyue/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:543: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MCC = 0.0\n", + "ACC = 0.6617647058823529\n", + "SENS = 1.0 \n", + "\n", + "C = 1e-06 gamma = 0.01\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/davidyue/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:543: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MCC = 0.0\n", + "ACC = 0.6617647058823529\n", + "SENS = 1.0 \n", + "\n", + "C = 1e-06 gamma = 0.1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/davidyue/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:543: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MCC = 0.0\n", + "ACC = 0.6617647058823529\n", + "SENS = 1.0 \n", + "\n", + "C = 1e-06 gamma = 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/davidyue/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:543: RuntimeWarning: invalid value encountered in double_scalars\n", + " mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MCC = 0.0\n", + "ACC = 0.6617647058823529\n", + "SENS = 1.0 \n", + "\n", + "C = 1e-05 gamma = 0.001\n", + "MCC = 0.6310547428675068\n", + "ACC = 0.8308823529411765\n", + "SENS = 1.0 \n", + "\n", + "C = 1e-05 gamma = 0.01\n", + "MCC = 0.6310547428675068\n", + "ACC = 0.8308823529411765\n", + "SENS = 1.0 \n", + "\n", + "C = 1e-05 gamma = 0.1\n", + "MCC = 0.6310547428675068\n", + "ACC = 0.8308823529411765\n", + "SENS = 1.0 \n", + "\n", + "C = 1e-05 gamma = 1\n", + "MCC = 0.6310547428675068\n", + "ACC = 0.8308823529411765\n", + "SENS = 1.0 \n", + "\n", + "C = 0.0001 gamma = 0.001\n", + "MCC = 0.9014492753623189\n", + "ACC = 0.9558823529411765\n", + "SENS = 0.9666666666666667 \n", + "\n", + "C = 0.0001 gamma = 0.01\n", + "MCC = 0.9014492753623189\n", + "ACC = 0.9558823529411765\n", + "SENS = 0.9666666666666667 \n", + "\n", + "C = 0.0001 gamma = 0.1\n", + "MCC = 0.9014492753623189\n", + "ACC = 0.9558823529411765\n", + "SENS = 0.9666666666666667 \n", + "\n", + "C = 0.0001 gamma = 1\n", + "MCC = 0.9014492753623189\n", + "ACC = 0.9558823529411765\n", + "SENS = 0.9666666666666667 \n", + "\n", + "C = 0.001 gamma = 0.001\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.001 gamma = 0.01\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.001 gamma = 0.1\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.001 gamma = 1\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.01 gamma = 0.001\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.01 gamma = 0.01\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.01 gamma = 0.1\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.01 gamma = 1\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.1 gamma = 0.001\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.1 gamma = 0.01\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.1 gamma = 0.1\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n", + "C = 0.1 gamma = 1\n", + "MCC = 0.8857501367027195\n", + "ACC = 0.9485294117647058\n", + "SENS = 0.9555555555555556 \n", + "\n" + ] + } + ], "source": [ - "## space for exercise" + "## space for exercise\n", + "C_list = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]\n", + "gamma_list = [0.001, 0.01, 0.1, 1]\n", + "for C in C_list:\n", + " for gamma in gamma_list:\n", + " print('C = ', C, ' gamma = ', gamma)\n", + " svc = svm.SVC(kernel = 'linear', C=C, gamma = gamma)\n", + " svc.fit(x_tr, class_lab_tr.values.ravel())\n", + " class_pred_ts = svc.predict(x_ts)\n", + " print('MCC = ', metrics.matthews_corrcoef(class_lab_ts, class_pred_ts))\n", + " print('ACC = ', metrics.accuracy_score(class_lab_ts, class_pred_ts))\n", + " print('SENS = ', metrics.recall_score(class_lab_ts, class_pred_ts), \"\\n\")" ] }, { @@ -460,14 +1517,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": { "colab": {}, "colab_type": "code", "id": "utM1ALBfLKLC", "outputId": "d96dc041-2f6f-4f1a-bca5-70310d1f79ee" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'C': 0.001, 'gamma': 0.001}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", @@ -505,14 +1573,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "colab": {}, "colab_type": "code", "id": "2lZAaTXJLKLH", "outputId": "2155231c-e50c-4c06-82c4-6b6a5f7c4ee2" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", + " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Build a forest and compute the feature importances\n", "rf = RandomForestClassifier(n_estimators=250)\n", @@ -531,14 +1616,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "colab": {}, "colab_type": "code", "id": "rspvHmO0LKLK", "outputId": "7b131d8f-ebc8-4d03-9f38-ad90de735367" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MCC = 0.8704408378703687\n", + "ACC = 0.9411764705882353\n", + "SENS = 0.9444444444444444\n" + ] + } + ], "source": [ "class_pred_ts = rf.predict(x_ts)\n", "print('MCC = ', metrics.matthews_corrcoef(class_lab_ts, class_pred_ts))\n", @@ -558,14 +1653,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "colab": {}, "colab_type": "code", "id": "7g9k5EHsLKLU", "outputId": "aa26094b-0e4a-48f0-be91-ecd2874ab204" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature ranking (top 10 features):\n", + "1. feature 1442 (0.010746)\n", + "2. feature 5892 (0.009566)\n", + "3. feature 8520 (0.009420)\n", + "4. feature 15 (0.009405)\n", + "5. feature 10739 (0.009211)\n", + "6. feature 8494 (0.008895)\n", + "7. feature 21477 (0.008595)\n", + "8. feature 11563 (0.008554)\n", + "9. feature 21809 (0.008044)\n", + "10. feature 5889 (0.007586)\n" + ] + } + ], "source": [ "importances = rf.feature_importances_\n", "indices = np.argsort(importances)[::-1]\n", @@ -588,14 +1701,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "colab": {}, "colab_type": "code", "id": "2fSkitN7LKLY", "outputId": "73191a71-9657-4582-ede6-7fb14cd3fc05" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PHLDB1.Gene_AceView\n", + "MAP7.Gene_AceView\n", + "CENPA.Gene_RefSeq\n", + "ODC1.Gene_AceView\n", + "POLA2.Gene_RefSeq\n", + "CDCA4.Gene_RefSeq\n", + "serloy.Gene_AceView\n", + "TIAF1.Gene_RefSeq\n", + "snawjarby.Gene_AceView\n", + "MAP3K12.Gene_AceView\n" + ] + } + ], "source": [ "columnsNamesArr = data_tr.columns.values\n", "for i in range(10):\n", @@ -626,7 +1756,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": { "colab": {}, "colab_type": "code", @@ -732,7 +1862,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.3" } }, "nbformat": 4,