# Data Analysis of Star Wars Survey Dataset

#### Chance Mason, Nicolas Arrieche Villegas, Mitchell Walker, Tyler Wittig

## Part 3. Feature Engineering

In [1]:
import pandas as pd
import numpy as np

In [2]:
%matplotlib inline

In [3]:
# Read the data from csv file
with open('column_names.txt', 'r') as cn:
    col_names = [line.strip() for line in cn]
    
data = pd.read_csv('survey_data.csv')

In [4]:
# remove RespondentID column
data.drop(['RespondentID'],axis=1, inplace=True)
col_names.remove('RespondentID')

data.head()

Unnamed: 0,Seen a Star Wars film,Fan of Star Wars,Seen The Phantom Menace,Seen Attack of the Clones,Seen Revenge of the Sith,Seen A New Hope,Seen The Empire Strikes Back,Seen Return of the Jedi,Rank for The Phantom Menace,Rank for Attack of the Clones,...,View of Yoda,Which character shot first?,Familiar with the Expanded Universe?,Fan of the Expanded Universe?,Star Trek Fan,Gender,Age,Household Income,Education,Location (Census Region)
0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,3.0,2.0,...,Very favorably,I don't understand this question,Yes,No,No,Male,18-29,?,High school degree,South Atlantic
1,No,Maybe,No,No,No,No,No,No,0.0,0.0,...,Unfamiliar (N/A),I don't understand this question,?,?,Yes,Male,18-29,"$0 - $24,999",Bachelor degree,West South Central
2,Yes,No,Yes,Yes,Yes,No,No,No,1.0,2.0,...,Unfamiliar (N/A),I don't understand this question,No,?,No,Male,18-29,"$0 - $24,999",High school degree,West North Central
3,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,5.0,6.0,...,Very favorably,I don't understand this question,No,?,Yes,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central
4,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,5.0,4.0,...,Somewhat favorably,Greedo,Yes,No,No,Male,18-29,"$100,000 - $149,999",Some college or Associate degree,West North Central


### 3.1 Convert Categorical Data to Numeric

#### We will convert the values of each column to numeric data separately, as follows:
* **"Seen a Star Wars film"**  
"No" = 0, "Yes" = 1
* **"Fan of Star Wars"**  
"No" = -1, "Maybe" = 0, "Yes" = 1
* **Seen {Movie}**  
"No" = 0, "Yes" = 1
* **Rank for {Movie}**  
Unchanged (1-6, or 0 if missing value) 
* **View of {Character}**  
"Very unfavorably" = -2, "Somewhat unfavorably" = -1, "Neither favorably nor unfavorably (neutral)" = 0, "Somewhat favorably" = 1, "Very favorably" = 2  
We will set the "Unfamiliar (N/A)" answer to 0, since that answer indicates a neutral response.
* **"Which character shot first?"**  
"Han" = -1, "I don't understand this question" = 0, "Greedo" = 1
* **{Familiar with, Fan of} the Expanded Universe?**  
"No" = -1, "?" = 0, "Yes" = 1
* **Star Trek Fan**  
"No" = -1, "Maybe" = 0, "Yes" = 1
* **Gender**  
Male = 0, Female = 1
* **Age**
"18-29" = 1, "30-44" = 2, "45-60" = 3, ">60" = 4, "?" = 0
* **Household Income**
"$0 - $24,999" = 1, "$25,000 - $49,999" = 2, "$50,000 - $99,999" = 3, "$100,000 - $149,999" = 4, "$150,000+" = 5, "?" = 0  
* **Education**
"Less than high school degree" = 1, "High school degree" = 2, "Some college or Associate degree" = 3, "Bachelor degree" = 4, "Graduate degree" = 5, "?" = 0
* **Location (Census Region)**  
"South Atlantic" = 1, "West South Central" = 2, "West North Central" = 3, "Middle Atlantic" = 4, "East North Central" = 5, "Pacific" = 6, "Mountain" = 7, "New England" = 8, "East South Central" = 9, "?" = 0

In [5]:
# convert categorical to numeric

# "Seen a Star Wars film"
data["Seen a Star Wars film"].replace("No", 0, inplace=True)
data["Seen a Star Wars film"].replace("Yes", 1, inplace=True)

# "Fan of Star Wars"
data["Fan of Star Wars"].replace("No", -1, inplace=True)
data["Fan of Star Wars"].replace("Maybe", 0, inplace=True)
data["Fan of Star Wars"].replace("Yes", 1, inplace=True)

# "Seen {Movie}"
seen_cols = col_names[2:8]
for col in seen_cols:
    data[col].replace("No", 0, inplace=True)
    data[col].replace("Yes", 1, inplace=True)
    
# "Rank for {Movie}"
# unchanged
    
# "View of {Character}"
view_cols = col_names[14:28]
for col in view_cols:
    data[col].replace("Very unfavorably", -2, inplace=True)  
    data[col].replace("Somewhat unfavorably", -1, inplace=True) 
    data[col].replace("Neither favorably nor unfavorably (neutral)", 0, inplace=True)  
    data[col].replace("Somewhat favorably", 1, inplace=True) 
    data[col].replace("Very favorably", 2, inplace=True) 
    data[col].replace("Unfamiliar (N/A)", 0, inplace=True)  

# "Which character shot first?"
data["Which character shot first?"].replace("Han", -1, inplace=True)
data["Which character shot first?"].replace("I don't understand this question", 0, inplace=True)
data["Which character shot first?"].replace("Greedo", 1, inplace=True)

# "{Familiar with, Fan of} the Expanded Universe?"  
univ_cols = col_names[29:31]
for col in univ_cols:
    data[col].replace("No", -1, inplace=True)
    data[col].replace("?", 0, inplace=True)
    data[col].replace("Yes", 1, inplace=True)

# "Star Trek Fan" 
data["Star Trek Fan"].replace("No", -1, inplace=True)
data["Star Trek Fan"].replace("Maybe", 0, inplace=True)
data["Star Trek Fan"].replace("Yes", 1, inplace=True)

# "Gender"
data["Gender"].replace("Male", -1, inplace=True)
data["Gender"].replace("?", 0, inplace=True)
data["Gender"].replace("Female", 1, inplace=True)

# "Age"
data["Age"].replace("18-29", 1, inplace=True)
data["Age"].replace("30-44", 2, inplace=True)
data["Age"].replace("45-60", 3, inplace=True)
data["Age"].replace("> 60", 4, inplace=True)
data["Age"].replace("?", 0, inplace=True)

# "Household Income"
data["Household Income"].replace("$0 - $24,999", 1, inplace=True)
data["Household Income"].replace("$25,000 - $49,999", 2, inplace=True)
data["Household Income"].replace("$50,000 - $99,999", 3, inplace=True)
data["Household Income"].replace("$100,000 - $149,999", 4, inplace=True)
data["Household Income"].replace("$150,000+", 5, inplace=True)
data["Household Income"].replace("?", 0, inplace=True)

# "Education"
data["Education"].replace("Less than high school degree", 1, inplace=True)
data["Education"].replace("High school degree", 2, inplace=True)
data["Education"].replace("Some college or Associate degree", 3, inplace=True)
data["Education"].replace("Bachelor degree", 4, inplace=True)
data["Education"].replace("Graduate degree", 5, inplace=True)
data["Education"].replace("?", 0, inplace=True)

# "Location (Census Region)"
data["Location (Census Region)"].replace("South Atlantic", 1, inplace=True)
data["Location (Census Region)"].replace("West South Central", 2, inplace=True)
data["Location (Census Region)"].replace("West North Central", 3, inplace=True)
data["Location (Census Region)"].replace("Middle Atlantic", 4, inplace=True)
data["Location (Census Region)"].replace("East North Central", 5, inplace=True)
data["Location (Census Region)"].replace("Pacific", 6, inplace=True)
data["Location (Census Region)"].replace("Mountain", 7, inplace=True)
data["Location (Census Region)"].replace("New England", 8, inplace=True)
data["Location (Census Region)"].replace("East South Central", 9, inplace=True)
data["Location (Census Region)"].replace("?", 0, inplace=True)

data.astype(int)
data.head()

Unnamed: 0,Seen a Star Wars film,Fan of Star Wars,Seen The Phantom Menace,Seen Attack of the Clones,Seen Revenge of the Sith,Seen A New Hope,Seen The Empire Strikes Back,Seen Return of the Jedi,Rank for The Phantom Menace,Rank for Attack of the Clones,...,View of Yoda,Which character shot first?,Familiar with the Expanded Universe?,Fan of the Expanded Universe?,Star Trek Fan,Gender,Age,Household Income,Education,Location (Census Region)
0,1,1,1,1,1,1,1,1,3.0,2.0,...,2,0,1,-1,-1,-1,1,0,2,1
1,0,0,0,0,0,0,0,0,0.0,0.0,...,0,0,0,0,1,-1,1,1,4,2
2,1,-1,1,1,1,0,0,0,1.0,2.0,...,0,0,-1,0,-1,-1,1,1,2,3
3,1,1,1,1,1,1,1,1,5.0,6.0,...,2,0,-1,0,1,-1,1,4,3,3
4,1,1,1,1,1,1,1,1,5.0,4.0,...,1,1,1,-1,-1,-1,1,4,3,3


### Write Numeric Dataset to CSV

In [6]:
data.to_csv('survey_numeric.csv', index=False)