In [None]:
import pandas as pd

# Dataset source: https://github.com/fivethirtyeight/data/tree/master/star-wars-survey
# The file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
star_wars = pd.read_csv(
    filepath_or_buffer='star_wars.csv',
    encoding='ISO-8859-1',
)

# Preview the first ten rows of the survey.
star_wars.head(n=10)

In [None]:
# Report the (rows, columns) dimensions of the frame.
frame_shape = star_wars.shape
print(frame_shape)

In [None]:
# Keep only the rows that carry a RespondentID, then report the new shape.
has_respondent_id = star_wars['RespondentID'].notnull()
star_wars = star_wars[has_respondent_id]
print(star_wars.shape)

In [None]:
# Before converting the answers to Booleans, inspect the distinct raw values
# of the question and how often each one occurs.
seen_any_col = 'Have you seen any of the 6 films in the Star Wars franchise?'
star_wars[seen_any_col].value_counts()

In [None]:
# Convert the "Yes"/"No" answers of the two questions below to Booleans using
# Series.map(dict). Any value that is not a key of `yes_no` becomes NaN.
yes_no = {"Yes": True, "No": False}
yes_no_cols = [
    'Have you seen any of the 6 films in the Star Wars franchise?',
    'Do you consider yourself to be a fan of the Star Wars film franchise?',
]

for col in yes_no_cols:
    # Only map while raw "Yes"/"No" strings are still present. Mapping an
    # already-converted column again would turn every True/False into NaN
    # (they are not keys of `yes_no`), so this guard keeps the cell safe to
    # re-run.
    if star_wars[col].isin(list(yes_no)).any():
        star_wars[col] = star_wars[col].map(yes_no)

# Re-examine the value counts after the conversion.
star_wars['Have you seen any of the 6 films in the Star Wars franchise?'].value_counts()

In [None]:
# List the distinct raw values stored in the first "films seen" column.
first_seen_col = 'Which of the following Star Wars films have you seen? Please select all that apply.'
star_wars[first_seen_col].unique()

In [None]:
# Original headers of the six "films seen" columns: the question text followed
# by 'Unnamed: 4' .. 'Unnamed: 8'.
seen_source_cols = ['Which of the following Star Wars films have you seen? Please select all that apply.']
seen_source_cols += ['Unnamed: {}'.format(position) for position in range(4, 9)]

# Map each original header to seen_1 .. seen_6 so the columns are easier to deal with.
star_wars_cols = {
    old_name: 'seen_{}'.format(number)
    for number, old_name in enumerate(seen_source_cols, start=1)
}

star_wars = star_wars.rename(columns=star_wars_cols)
star_wars.head(3)

In [None]:
import numpy as np

# Lookup used to turn the "seen" columns into Booleans: each film title maps to
# True and a missing value (NaN) maps to False. The double spaces inside some
# titles are intentional; the keys must match the values in the file exactly.
star_wars_bools = {'Star Wars: Episode I  The Phantom Menace':True,
                    'Star Wars: Episode II  Attack of the Clones':True,
                   'Star Wars: Episode III  Revenge of the Sith':True,
                    'Star Wars: Episode IV  A New Hope':True,
                    'Star Wars: Episode V The Empire Strikes Back':True,
                    'Star Wars: Episode VI Return of the Jedi':True,
                   np.nan:False}

# Assign a Boolean to each of the six "seen" columns (column positions 3-8)
# saying whether or not the person saw that film.
for col in star_wars.columns[3:9]:
    # Skip columns that are already Boolean. True/False are not keys of
    # `star_wars_bools`, so mapping a converted column a second time would
    # replace every value with NaN; the guard keeps the cell safe to re-run.
    if not pd.api.types.is_bool_dtype(star_wars[col]):
        star_wars[col] = star_wars[col].map(star_wars_bools)

In [None]:
# Cast the six ranking columns (column positions 9-14) to float.
ranking_columns = star_wars.columns[9:15]
star_wars[ranking_columns] = star_wars[ranking_columns].astype(float)

In [None]:
# Rename the six ranking columns to ranking_1 .. ranking_6 with DataFrame.rename().
ranking_source_cols = ['Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.']
ranking_source_cols += ['Unnamed: {}'.format(position) for position in range(10, 15)]

# Pair every original header with its new, shorter name.
ranking_target_cols = ['ranking_{}'.format(number) for number in range(1, 7)]
rank_cols = dict(zip(ranking_source_cols, ranking_target_cols))

star_wars = star_wars.rename(columns=rank_cols)

In [None]:
# Mean of each of the six ranking columns (column positions 9-14).
ranking_columns = star_wars.columns[9:15]
star_wars.loc[:, ranking_columns].mean()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Bar chart of the mean ranking per film.
# The bars are placed at x = 1..6 rather than range(6) (0..5) so that the tick
# under each bar reads as the episode number instead of being off by one.
# NOTE(review): assumes the six ranking columns are in episode order, as the
# notebook's commentary does - confirm against the survey header.
episode_numbers = list(range(1, 7))
mean_rankings = star_wars[star_wars.columns[9:15]].mean()

plt.bar(episode_numbers, mean_rankings)
plt.xticks(episode_numbers)
plt.xlabel('Episode')
plt.ylabel('Mean ranking (1 = favorite, 6 = least favorite)')
plt.title('Mean ranking of each Star Wars film')

plt.show()

Up to this point, we have cleaned the checkbox data into a usable format and mapped some of the responses to usable Boolean values.
We have also put the ranking columns into a usable format so that we could create the bar plot above. This plot shows the mean ranking of each Star Wars film; the lower the mean ranking, the higher the film ranks in the fans' minds. A low mean rank means the film is well liked.
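The bar plot above shows that the worst-ranked film on average (highest mean rank) is episode 3, while the best-ranked film on average (lowest mean rank) is episode 5. Note that the bars are drawn at positions 0 through 5, so the bar at position 4 is episode 5. This supports FiveThirtyEight's claim that The Empire Strikes Back (episode 5) is the best Star Wars film.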

In [None]:
# Total each of the six 'seen' columns (column positions 3-8) with sum().
seen_columns = star_wars.columns[3:9]
star_wars.loc[:, seen_columns].sum()

In [None]:
# Bar chart of the totals of the six 'seen' columns.
# The bars are placed at x = 1..6 rather than range(6) (0..5) so that the tick
# under each bar reads as the episode number instead of being off by one.
episode_numbers = list(range(1, 7))
seen_totals = star_wars[star_wars.columns[3:9]].sum()

plt.bar(episode_numbers, seen_totals)
plt.xticks(episode_numbers)
plt.xlabel('Episode')
plt.ylabel('Number of respondents who have seen the film')
plt.title('How many respondents have seen each Star Wars film')

plt.show()

This graph is in keeping with the bar graph of the rankings. It seems that the fewest people have seen episode 3 (also the lowest ranked), and the most people have seen episode 5, which is also the highest ranked.

In [None]:
# Partition the respondents by the value in the 'Gender' column.
gender = star_wars['Gender']
males = star_wars[gender.eq('Male')]
females = star_wars[gender.eq('Female')]

In [None]:
# Mean of each ranking column (column positions 9-14) among male respondents.
male_ranking_columns = males.columns[9:15]
males.loc[:, male_ranking_columns].mean()

In [None]:
# Bar chart of the mean ranking per film among male respondents.
# The bars are placed at x = 1..6 rather than range(6) (0..5) so that the tick
# under each bar reads as the episode number instead of being off by one.
episode_numbers = list(range(1, 7))
male_mean_rankings = males[males.columns[9:15]].mean()

plt.bar(episode_numbers, male_mean_rankings)
plt.xticks(episode_numbers)
plt.xlabel('Episode')
plt.ylabel('Mean ranking (1 = favorite, 6 = least favorite)')
plt.title('Mean ranking of each Star Wars film (male respondents)')

plt.show()

We see that the males tend to rank the older films (episodes 4 through 6) higher than the later films (episodes 1 through 3). Still, episode 5 is the highest ranked and episode 3 is the lowest ranked.

In [None]:
# Mean of each ranking column (column positions 9-14) among female respondents.
female_ranking_columns = females.columns[9:15]
females.loc[:, female_ranking_columns].mean()

In [None]:
# Bar chart of the mean ranking per film among female respondents.
# The bars are placed at x = 1..6 rather than range(6) (0..5) so that the tick
# under each bar reads as the episode number instead of being off by one.
episode_numbers = list(range(1, 7))
female_mean_rankings = females[females.columns[9:15]].mean()

plt.bar(episode_numbers, female_mean_rankings)
plt.xticks(episode_numbers)
plt.xlabel('Episode')
plt.ylabel('Mean ranking (1 = favorite, 6 = least favorite)')
plt.title('Mean ranking of each Star Wars film (female respondents)')

plt.show()

We can see that, for the females, there is a lot more variation in the rankings of the first three films. Yet, the female dataset is in keeping with the trend; episode 3 is ranked lowest and episode 5 is ranked highest.

In [None]:
# Total each 'seen' column (column positions 3-8) among male respondents.
male_seen_columns = males.columns[3:9]
males.loc[:, male_seen_columns].sum()

In [None]:
# Bar chart of the totals of the six 'seen' columns among male respondents.
# The bars are placed at x = 1..6 rather than range(6) (0..5) so that the tick
# under each bar reads as the episode number instead of being off by one.
episode_numbers = list(range(1, 7))
male_seen_totals = males[males.columns[3:9]].sum()

plt.bar(episode_numbers, male_seen_totals)
plt.xticks(episode_numbers)
plt.xlabel('Episode')
plt.ylabel('Number of male respondents who have seen the film')
plt.title('How many male respondents have seen each Star Wars film')

plt.show()

In [None]:
# Total each 'seen' column (column positions 3-8) among female respondents.
female_seen_columns = females.columns[3:9]
females.loc[:, female_seen_columns].sum()

In [None]:
# Bar chart of the totals of the six 'seen' columns among female respondents.
# The bars are placed at x = 1..6 rather than range(6) (0..5) so that the tick
# under each bar reads as the episode number instead of being off by one.
episode_numbers = list(range(1, 7))
female_seen_totals = females[females.columns[3:9]].sum()

plt.bar(episode_numbers, female_seen_totals)
plt.xticks(episode_numbers)
plt.xlabel('Episode')
plt.ylabel('Number of female respondents who have seen the film')
plt.title('How many female respondents have seen each Star Wars film')

plt.show()

These two bar plots show us that, in general, far fewer women have seen the Star Wars films, especially the newer ones (episodes 1 through 3).

Try to segment the data by columns like Education, Location (Census Region), and Which character shot first?, which aren't binary. Are there any interesting patterns?
Clean up columns 15 to 29, which have to do with what characters are viewed favorably and unfavorably.
Which character is the most liked?
Which character is the most disliked?
Which character creates the most controversy? (split between dislikes and likes)