In [None]:
 %matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

In [None]:
# Attribute Information from
# https://archive.ics.uci.edu/ml/datasets/Flags
#    1. name:       Name of the country concerned

#    2. landmass:
#                   1=N.America,
#                   2=S.America,
#                   3=Europe,
#                   4=Africa,
#                   5=Asia,
#                   6=Oceania

#    3. zone: Geographic quadrant, based on Greenwich and the Equator
#                   1=NE,
#                   2=SE,
#                   3=SW,
#                   4=NW

#    4. area:       in thousands of square km

#    5. population: in round millions

#    6. language:
#                   1=English,
#                   2=Spanish,
#                   3=French,
#                   4=German,
#                   5=Slavic,
#                   6=other Indo-European,
#                   7=Chinese,
#                   8=Arabic, 
#                   9=Japanese/Turkish/Finnish/Magyar,
#                  10=others

#    7. religion:
#                   0=Catholic,
#                   1=Other Christian,
#                   2=Muslim,
#                   3=Buddhist,
#                   4=Hindu,
#                   5=ethnic,
#                   6=Marxist,
#                   7=others

#    8. bars:       number of vertical bars in the flag

#    9. stripes:    number of horizontal stripes in the flag

#   10. colours:    number of different colours in the flag

#   11. red:        0 if red absent, 1 if red present in the flag

#   12. green:      same for green

#   13. blue:       same for blue

#   14. gold:       same for gold (also yellow)

#   15. white:      same for white

#   16. black:      same for black

#   17. orange:     same for orange (also brown)

#   18. mainhue:    predominant colour in the flag (tie-breaks decided by taking the topmost hue,
#                   if that fails then the most central hue, and if that fails the leftmost hue)

#   19. circles:    number of circles in the flag

#   20. crosses:    number of (upright) crosses

#   21. saltires:   number of diagonal crosses

#   22. quarters:   number of quartered sections

#   23. sunstars:   number of sun or star symbols

#   24. crescent:   1 if a crescent moon symbol present, else 0

#   25. triangle:   1 if any triangles present, else 0

#   26. icon:       1 if an inanimate image present (e.g., a boat), else 0

#   27. animate:    1 if an animate image (e.g., an eagle, a tree, a human hand) present, else 0

#   28. text:       1 if any letters or writing on the flag (e.g., a motto or slogan), else 0

#   29. topleft:    color in the top-left corner (moving right to decide tie-breaks)

#   30. botright:   color in the bottom-right corner (moving left to decide tie-breaks)

In [None]:
# Render every column of a DataFrame instead of eliding the middle ones.
pd.options.display.max_columns = None

In [None]:
# Load the flags table. The column names are supplied explicitly through
# `names=`, in the order of the attribute list documented above.
flag_data = pd.read_csv(
    "flags.csv",
    names=[
        'name', 'landmass', 'zone', 'area', 'population', 'language', 'religion',
        'bars', 'stripes', 'colors',
        'red', 'green', 'blue', 'gold', 'white', 'black', 'orange',
        'mainhue', 'circles', 'crosses', 'saltires', 'quarters', 'sunstars',
        'crescent', 'triangle', 'icon', 'animate', 'text', 'topleft', 'botright',
    ],
)
flag_data

## Select your features

In [None]:
# Prediction target: the `religion` column of the flags table.
target = flag_data.loc[:, 'religion']

In [None]:
# Features are every column except the country name and the target itself.
selected_features = flag_data.drop(['name', 'religion'], axis='columns')
selected_features.head(n=20)

In [None]:
# One-hot encode the non-numeric columns (presumably the colour-name columns
# mainhue / topleft / botright); numeric columns pass through unchanged.
selected_features = pd.get_dummies(data=selected_features)
selected_features.head(n=5)

In [8]:
# Sanity check: the feature matrix and the target must have the same row count.
print(f"{selected_features.shape} {target.shape}")

(194, 48) (194,)


## Create a Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (the library default, spelled out here);
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    selected_features,
    target,
    test_size=0.25,
    random_state=42,
)

## Pre-Processing

In [10]:
# Scale your data
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler and fit it on the training features only, so the
# scaling statistics are not influenced by the held-out test rows.
X_scaler = StandardScaler().fit(X=X_train)

##  Train the Model

In [11]:
# Apply the fitted X_scaler to both feature splits.
# Only the features are scaled; the target values are used as-is.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [12]:
# Display the training target values as the cell output (inspection only).
y_train

In [None]:
 # Create the model using LinearRegression

### BEGIN SOLUTION
from sklearn.linear_model import LinearRegression

# Two independent, not-yet-fitted linear regression estimators.
model, model_test = LinearRegression(), LinearRegression()
### END SOLUTION

In [13]:
# Fit the model on the scaled training data, then score it on both splits.
# LinearRegression.score returns the coefficient of determination (R^2).

### BEGIN SOLUTION
# fit() returns the estimator itself, so the training score is chained onto it.
training_score = model.fit(X_train_scaled, y_train).score(X_train_scaled, y_train)
testing_score = model.score(X_test_scaled, y_test)

### END SOLUTION

# NOTE(review): the recorded run shows a hugely negative testing score, i.e. the
# fit does not generalise; possibly caused by collinear one-hot columns (unconfirmed).
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.41483762448111494
Testing Score: -5.77415445226867e+22


In [None]:
# Residual plot for the training split: predicted value on the x-axis,
# (prediction - actual) on the y-axis, with a zero reference line.
predictions = model.predict(X_train_scaled)
plt.scatter(x=predictions, y=predictions - y_train, c="red")
plt.hlines(0, predictions.min(), predictions.max())
plt.title("Training Data Residual Plot")
plt.show()

In [None]:
# Evaluate the model that was fitted on the training split against the held-out
# test split. Fitting a separate estimator on the test rows and plotting its
# residuals on those same rows would only show in-sample fit, not how the
# trained model behaves on unseen data.
test_predictions = model.predict(X_test_scaled)
# Plot residuals: predicted value on the x-axis, (prediction - actual) on the y-axis.
plt.scatter(test_predictions, test_predictions - y_test, c="green", label="Testing Data")
plt.hlines(y=0, xmin=test_predictions.min(), xmax=test_predictions.max())
plt.legend()  # render the "Testing Data" label given to scatter()
plt.title("Testing Data Residual Plot")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

# Score the fitted model on the scaled test split: mean squared error of its
# predictions plus the R^2 value reported by the estimator's own score().
predictions = model.predict(X_test_scaled)
MSE = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = model.score(X=X_test_scaled, y=y_test)

print(f"MSE: {MSE}, R2: {r2}")