diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 8818016..de3b0ba 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -19,7 +19,8 @@ "# Libraries\n", "import pandas as pd\n", "import numpy as np\n", - "from sklearn import datasets\n" + "from sklearn import datasets\n", + "import sklearn" ] }, { @@ -44,7 +45,7 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here\n" + "diabetes = sklearn.datasets.load_diabetes()" ] }, { @@ -71,7 +72,8 @@ } ], "source": [ - "# your code here\n" + "# your code here\n", + "diabetes.keys()" ] }, { @@ -89,7 +91,7 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [ { @@ -138,7 +140,7 @@ } ], "source": [ - "# your code here\n" + "print(diabetes[\"DESCR\"])" ] }, { @@ -158,9 +160,24 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(442, 10)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your answer here \n" + "# 1. There are 7 attributes:\n", + "len(diabetes.keys())\n", + "# 2. \"data\" is the independent feature and \"target\" the dependent feat.\n", + "# 3. There are 442 records in the data\n", + "diabetes[\"data\"].shape" ] }, { @@ -177,13 +194,6 @@ "execution_count": 6, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Shape of 'data' :\n" - ] - }, { "data": { "text/plain": [ @@ -196,7 +206,8 @@ } ], "source": [ - "# your code here\n" + "# your code here\n", + "diabetes[\"data\"].shape" ] }, { @@ -204,13 +215,6 @@ "execution_count": 7, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Shape of 'target' :\n" - ] - }, { "data": { "text/plain": [ @@ -222,7 +226,9 @@ "output_type": "execute_result" } ], - "source": [] + "source": [ + "diabetes[\"target\"].shape" + ] }, { "cell_type": "markdown", @@ -262,7 +268,8 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here\n" + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.model_selection import train_test_split" ] }, { @@ -278,7 +285,7 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here\n" + "diabetes_model = LinearRegression()" ] }, { @@ -296,7 +303,13 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here\n" + "X = diabetes[\"data\"]\n", + "y = diabetes[\"target\"]\n", + "\n", + "diabetes_data_train = X[:-20,:]\n", + "diabetes_data_test = X[-20:,:]\n", + "diabetes_target_train = y[:-20]\n", + "diabetes_target_test = y[-20:]" ] }, { @@ -308,7 +321,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -317,46 +330,53 @@ "LinearRegression()" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# your code here\n" + "diabetes_model.fit(diabetes_data_train, diabetes_target_train)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Intercept: 152.76430691633442\n" + "152.76430691633442\n" ] } ], - "source": [] + "source": [ + "# Intercept: 152.76430691633442\n", + "print(diabetes_model.intercept_)" + ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Coefficients: [ 3.03499549e-01 -2.37639315e+02 5.10530605e+02 3.27736980e+02\n", + "[ 3.03499549e-01 -2.37639315e+02 5.10530605e+02 3.27736980e+02\n", " -8.14131709e+02 4.92814588e+02 1.02848452e+02 1.84606489e+02\n", " 7.43519617e+02 7.60951722e+01]\n" ] } ], - "source": [] + "source": [ + "# Coefficients: [ 3.03499549e-01 -2.37639315e+02 5.10530605e+02 3.27736980e+02\n", + "# -8.14131709e+02 4.92814588e+02 1.02848452e+02 1.84606489e+02 7.43519617e+02 7.60951722e+01]\n", + "print(diabetes_model.coef_)" + ] }, { "cell_type": "markdown", @@ -376,11 +396,26 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([197.61846908, 155.43979328, 172.88665147, 111.53537279,\n", + " 164.80054784, 131.06954875, 259.12237761, 100.47935157,\n", + " 117.0601052 , 124.30503555, 218.36632793, 61.19831284,\n", + " 132.25046751, 120.3332925 , 52.54458691, 194.03798088,\n", + " 102.57139702, 123.56604987, 211.0346317 , 52.60335674])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here\n" + "diabetes_model.predict(diabetes_data_test)" ] }, { @@ -392,16 +427,9 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 15, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "diabetes_target_test:\n" - ] - }, { "data": { "text/plain": [ @@ -409,27 +437,22 @@ " 72., 49., 64., 48., 178., 104., 132., 220., 57.])" ] }, - "execution_count": 23, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# your code here\n" + "# array([233., 91., 111., 152., 120., 67., 310., 94., 183., 66., 173.,\n", + "# 72., 49., 64., 48., 178., 104., 132., 220., 57.])\n", + "diabetes_target_test" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "test prediction:\n" - ] - }, { "data": { "text/plain": [ @@ -440,12 +463,19 @@ " 102.57139702, 123.56604987, 211.0346317 , 52.60335674])" ] }, - "execution_count": 24, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], - "source": [] + "source": [ + "# array([197.61846908, 155.43979328, 172.88665147, 111.53537279,\n", + "# 164.80054784, 131.06954875, 259.12237761, 100.47935157,\n", + "# 117.0601052 , 124.30503555, 218.36632793, 61.19831284,\n", + "# 132.25046751, 120.3332925 , 52.54458691, 194.03798088,\n", + "# 102.57139702, 123.56604987, 211.0346317 , 52.60335674])\n", + "diabetes_model.predict(diabetes_data_test)" + ] }, { "cell_type": "markdown", @@ -456,11 +486,11 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "# your answer here \n" + "# No. It's different. The test size is very small in order to do a better prediction." ] }, { @@ -495,50 +525,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " OLS Regression Results \n", - "==============================================================================\n", - "Dep. Variable: y R-squared: 0.512\n", - "Model: OLS Adj. R-squared: 0.500\n", - "Method: Least Squares F-statistic: 43.16\n", - "Date: Mon, 10 May 2021 Prob (F-statistic): 4.64e-58\n", - "Time: 17:52:26 Log-Likelihood: -2281.1\n", - "No. Observations: 422 AIC: 4584.\n", - "Df Residuals: 411 BIC: 4629.\n", - "Df Model: 10 \n", - "Covariance Type: nonrobust \n", - "==============================================================================\n", - " coef std err t P>|t| [0.025 0.975]\n", - "------------------------------------------------------------------------------\n", - "x1 0.3035 61.286 0.005 0.996 -120.169 120.776\n", - "x2 -237.6393 62.837 -3.782 0.000 -361.162 -114.117\n", - "x3 510.5306 68.156 7.491 0.000 376.553 644.508\n", - "x4 327.7370 66.876 4.901 0.000 196.275 459.199\n", - "x5 -814.1317 424.044 -1.920 0.056 -1647.697 19.434\n", - "x6 492.8146 344.227 1.432 0.153 -183.850 1169.480\n", - "x7 102.8485 219.463 0.469 0.640 -328.561 534.258\n", - "x8 184.6065 167.336 1.103 0.271 -144.334 513.547\n", - "x9 743.5196 175.359 4.240 0.000 398.807 1088.232\n", - "x10 76.0952 68.293 1.114 0.266 -58.152 210.343\n", - "const 152.7643 2.658 57.469 0.000 147.539 157.990\n", - "==============================================================================\n", - "Omnibus: 1.544 Durbin-Watson: 2.026\n", - "Prob(Omnibus): 0.462 Jarque-Bera (JB): 1.421\n", - "Skew: 0.004 Prob(JB): 0.491\n", - "Kurtosis: 2.716 Cond. No. 224.\n", - "==============================================================================\n", - "\n", - "Warnings:\n", - "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" - ] - } - ], + "outputs": [], "source": [ "# your code here\n", "import statsmodels.api as sm\n", @@ -562,11 +551,11 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "# your answer here\n" + "# your answer here" ] }, { @@ -587,11 +576,11 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ - "# your code here\n" + "auto = pd.read_csv(r\"C:\\Users\\trito.DESKTOP-V9IEUOR\\Desktop\\IronHack Data Analyst\\Labs\\week7labs\\lab-supervised-learning-sklearn\\data\\auto-mpg.csv\")" ] }, { @@ -603,7 +592,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -713,13 +702,13 @@ "4 70 \\t\"ford torino\" " ] }, - "execution_count": 27, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# your code here\n" + "auto.head()" ] }, { @@ -731,7 +720,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -752,12 +741,28 @@ " 6 model_year 398 non-null int64 \n", " 7 car_name 398 non-null object \n", "dtypes: float64(4), int64(3), object(1)\n", - "memory usage: 25.0+ KB\n" + "memory usage: 25.0+ KB\n", + "\n", + "RangeIndex: 398 entries, 0 to 397\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 mpg 398 non-null float64\n", + " 1 cylinders 398 non-null int64 \n", + " 2 displacement 398 non-null float64\n", + " 3 horse_power 392 non-null float64\n", + " 4 weight 398 non-null int64 \n", + " 5 acceleration 398 non-null float64\n", + " 6 model_year 398 non-null int64 \n", + "dtypes: float64(4), int64(3)\n", + "memory usage: 21.9 KB\n" ] } ], "source": [ - "# your code here\n" + "auto.info() # car_name is not numeric.\n", + "auto = auto.drop(columns=[\"car_name\"])\n", + "auto.info()" ] }, { @@ -769,7 +774,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -778,19 +783,20 @@ "70" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# your code here\n", - "# OLDEST MODEL\n" + "# OLDEST MODEL 70\n", + "auto[\"model_year\"].min()" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -799,13 +805,14 @@ "82" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# NEWEST MODEL \n" + "# NEWEST MODEL 82\n", + "auto[\"model_year\"].max()" ] }, { @@ -817,11 +824,59 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 25, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horse_power 6\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horse_power 0\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here\n" + "auto = auto.dropna()\n", + "auto.isnull().sum()" ] }, { @@ -833,7 +888,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -847,13 +902,13 @@ "Name: cylinders, dtype: int64" ] }, - "execution_count": 29, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# your code here \n" + "auto[\"cylinders\"].value_counts() # THere are 5 possible values." ] }, { @@ -867,13 +922,28 @@ "*Hint: To separate data for training and test, use the `train_test_split` method we used in previous labs.*" ] }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "auto.columns\n", + "X = auto.iloc[:,1:]\n", + "y = auto.iloc[:,0]" + ] + }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "# your code here\n" + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=8)" ] }, { @@ -887,11 +957,23 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here\n" + "auto_model = LinearRegression()\n", + "auto_model.fit(X_train,y_train)" ] }, { @@ -921,22 +1003,26 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.8198690008457218" + "0.8081736333891638" ] }, - "execution_count": 38, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# your code here\n" + "y_pred = auto_model.predict(X_train)\n", + "\n", + "from sklearn.metrics import r2_score\n", + "\n", + "r2_score(y_train, y_pred)" ] }, { @@ -952,22 +1038,24 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.7507754274816084" + "0.8085475967919091" ] }, - "execution_count": 39, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# your code here\n" + "y_test_pred = auto_model.predict(X_test)\n", + "\n", + "r2_score(y_test,y_test_pred)" ] }, { @@ -985,7 +1073,7 @@ "metadata": {}, "outputs": [], "source": [ - "# your answer here\n" + "# Both R2 values are pretty similar, this meaning that the model is well fitted. " ] }, { @@ -1001,11 +1089,11 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "# your code here\n" + "X_train09, X_test09, y_train09, y_test09 = train_test_split(X, y, test_size=0.90, random_state=1)" ] }, { @@ -1017,11 +1105,23 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here\n" + "auto_model09 = LinearRegression()\n", + "auto_model09.fit(X_train09,y_train09)" ] }, { @@ -1033,22 +1133,32 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_train_09 = auto_model09.predict(X_train09)\n", + "y_pred_test_09 = auto_model09.predict(X_test09)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.8109551916128583" + "0.9181704244987553" ] }, - "execution_count": 39, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# your code here\n" + "r2_score(y_train09,y_pred_train_09)" ] }, { @@ -1060,22 +1170,22 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.7913151386161112" + "0.7608096464950158" ] }, - "execution_count": 40, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# your code here\n" + "r2_score(y_test09,y_pred_test_09) # There is an improvement." ] }, { @@ -1217,7 +1327,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.8.5" } }, "nbformat": 4,