In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Attribute Information from
# https://archive.ics.uci.edu/ml/datasets/Flags
#    1. name:       Name of the country concerned

#    2. landmass:
#                   1=N.America,
#                   2=S.America,
#                   3=Europe,
#                   4=Africa,
#                   5=Asia,
#                   6=Oceania

#    3. zone: Geographic quadrant, based on Greenwich and the Equator
#                   1=NE,
#                   2=SE,
#                   3=SW,
#                   4=NW

#    4. area:       in thousands of square km

#    5. population: in round millions

#    6. language:
#                   1=English,
#                   2=Spanish,
#                   3=French,
#                   4=German,
#                   5=Slavic,
#                   6=other Indo-European,
#                   7=Chinese,
#                   8=Arabic, 
#                   9=Japanese/Turkish/Finnish/Magyar,
#                  10=others

#    7. religion:
#                   0=Catholic,
#                   1=Other Christian,
#                   2=Muslim,
#                   3=Buddhist,
#                   4=Hindu,
#                   5=ethnic,
#                   6=Marxist,
#                   7=others

#    8. bars:       number of vertical bars in the flag

#    9. stripes:    number of horizontal stripes in the flag

#   10. colours:    number of different colours in the flag

#   11. red:        0 if red absent, 1 if red present in the flag

#   12. green:      same for green

#   13. blue:       same for blue

#   14. gold:       same for gold (also yellow)

#   15. white:      same for white

#   16. black:      same for black

#   17. orange:     same for orange (also brown)

#   18. mainhue:    predominant colour in the flag (tie-breaks decided by taking the topmost hue,
#                   if that fails then the most central hue, and if that fails the leftmost hue)

#   19. circles:    number of circles in the flag

#   20. crosses:    number of (upright) crosses

#   21. saltires:   number of diagonal crosses

#   22. quarters:   number of quartered sections

#   23. sunstars:   number of sun or star symbols

#   24. crescent:   1 if a crescent moon symbol present, else 0

#   25. triangle:   1 if any triangles present, else 0

#   26. icon:       1 if an inanimate image present (e.g., a boat), else 0

#   27. animate:    1 if an animate image (e.g., an eagle, a tree, a human hand) present, else 0

#   28. text:       1 if any letters or writing on the flag (e.g., a motto or slogan), else 0

#   29. topleft:    color in the top-left corner (moving right to decide tie-breaks)

#   30. botright:   color in the bottom-left corner (moving left to decide tie-breaks)

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
flag_data = pd.read_csv("flags.csv", names=['name', 'landmass', 'zone', 'area', 'population', 'language', 'religion', 'bars', 'stripes', 'colors', 'red', 'green', 'blue', 'gold', 'white', 'black', 'orange', 'mainhue', 'circles', 'crosses', 'saltires', 'quarters', 'sunstars', 'crescent', 'triangle', 'icon', 'animate', 'text', 'topleft', 'botright'])
flag_data

Unnamed: 0,name,landmass,zone,area,population,language,religion,bars,stripes,colors,red,green,blue,gold,white,black,orange,mainhue,circles,crosses,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
0,Afghanistan,5,1,648,16,10,2,0,3,5,1,1,0,1,1,1,0,green,0,0,0,0,1,0,0,1,0,0,black,green
1,Albania,3,1,29,3,6,6,0,0,3,1,0,0,1,0,1,0,red,0,0,0,0,1,0,0,0,1,0,red,red
2,Algeria,4,1,2388,20,8,2,2,0,3,1,1,0,0,1,0,0,green,0,0,0,0,1,1,0,0,0,0,green,white
3,American-Samoa,6,3,0,0,1,1,0,0,5,1,0,1,1,1,0,1,blue,0,0,0,0,0,0,1,1,1,0,blue,red
4,Andorra,3,1,0,0,6,0,3,0,3,1,0,1,1,0,0,0,gold,0,0,0,0,0,0,0,0,0,0,blue,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,Western-Samoa,6,3,3,0,1,1,0,0,3,1,0,1,0,1,0,0,red,0,0,0,1,5,0,0,0,0,0,blue,red
190,Yugoslavia,3,1,256,22,6,6,0,3,4,1,0,1,1,1,0,0,red,0,0,0,0,1,0,0,0,0,0,blue,red
191,Zaire,4,2,905,28,10,5,0,0,4,1,1,0,1,0,0,1,green,1,0,0,0,0,0,0,1,1,0,green,green
192,Zambia,4,2,753,6,10,5,3,0,4,1,1,0,0,0,1,1,green,0,0,0,0,0,0,0,0,1,0,green,brown


## Select your features

In [5]:
target = flag_data['religion']

In [6]:
selected_features = flag_data.drop(columns=['name', 'religion'])
selected_features.head(20)

Unnamed: 0,landmass,zone,area,population,language,bars,stripes,colors,red,green,blue,gold,white,black,orange,mainhue,circles,crosses,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
0,5,1,648,16,10,0,3,5,1,1,0,1,1,1,0,green,0,0,0,0,1,0,0,1,0,0,black,green
1,3,1,29,3,6,0,0,3,1,0,0,1,0,1,0,red,0,0,0,0,1,0,0,0,1,0,red,red
2,4,1,2388,20,8,2,0,3,1,1,0,0,1,0,0,green,0,0,0,0,1,1,0,0,0,0,green,white
3,6,3,0,0,1,0,0,5,1,0,1,1,1,0,1,blue,0,0,0,0,0,0,1,1,1,0,blue,red
4,3,1,0,0,6,3,0,3,1,0,1,1,0,0,0,gold,0,0,0,0,0,0,0,0,0,0,blue,red
5,4,2,1247,7,10,0,2,3,1,0,0,1,0,1,0,red,0,0,0,0,1,0,0,1,0,0,red,black
6,1,4,0,0,1,0,1,3,0,0,1,0,1,0,1,white,0,0,0,0,0,0,0,0,1,0,white,blue
7,1,4,0,0,1,0,1,5,1,0,1,1,1,1,0,red,0,0,0,0,1,0,1,0,0,0,black,red
8,2,3,2777,28,2,0,3,2,0,0,1,0,1,0,0,blue,0,0,0,0,0,0,0,0,0,0,blue,blue
9,2,3,2777,28,2,0,3,3,0,0,1,1,1,0,0,blue,0,0,0,0,1,0,0,0,0,0,blue,blue


In [7]:
selected_features = pd.get_dummies(selected_features)
selected_features.head()

Unnamed: 0,landmass,zone,area,population,language,bars,stripes,colors,red,green,blue,gold,white,black,orange,circles,crosses,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,mainhue_black,mainhue_blue,mainhue_brown,mainhue_gold,mainhue_green,mainhue_orange,mainhue_red,mainhue_white,topleft_black,topleft_blue,topleft_gold,topleft_green,topleft_orange,topleft_red,topleft_white,botright_black,botright_blue,botright_brown,botright_gold,botright_green,botright_orange,botright_red,botright_white
0,5,1,648,16,10,0,3,5,1,1,0,1,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,3,1,29,3,6,0,0,3,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
2,4,1,2388,20,8,2,0,3,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
3,6,3,0,0,1,0,0,5,1,0,1,1,1,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
4,3,1,0,0,6,3,0,3,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [8]:
print(selected_features.shape, target.shape)

(194, 48) (194,)


## Create a Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)

## Pre-Processing

In [10]:
# Scale your data
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data

X_scaler = StandardScaler().fit(X_train)

##  Train the Model

In [11]:
# Transform the training and testing data using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
 # Create the model using LinearRegression

### BEGIN SOLUTION
from sklearn.linear_model import LinearRegression
model = LinearRegression()
### END SOLUTION

In [13]:
# Fit the model to the training data and calculate the scores for the training and testing data

### BEGIN SOLUTION
model.fit(X_train_scaled, y_train)
training_score = model.score(X_train_scaled, y_train)
testing_score = model.score(X_test_scaled, y_test)

### END SOLUTION 

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.41483762448111494
Testing Score: -5.77415445226867e+22
