# IST 652 Final Project - A Pantheon Exploration

In [None]:
#os.getcwd()
#os.path.isfile("data/database.csv")

In [6]:
import os
import pandas as pd
from matplotlib import pyplot

# Introduction

Research questions:

* Which historic characters are the most popular?
* When did they live and where are they from?
* What factors could have generated their popularity?
* Are there any observable trends in the categorical data provided?
* What associations exist between the variables in the data?
* What clusters and groupings are in the data? How do the groups compare in popularity?

# Analysis

Key analysis methods used in report:
* Data Cleaning
* Sorting and subsets of the data.
* Line and bar plots.
* Multiple regression

## About the Data

In [88]:
# Loading data from the .csv file.
# Fail fast when the file is missing: only printing a message would leave
# dfDirtyData undefined, and every later cell would then stop with a
# NameError that hides the real cause.
dataFileName = "data/database.csv"
isExist = os.path.isfile(dataFileName)
if not isExist:
    raise FileNotFoundError(
        "File not found: {} (current working directory: {})".format(dataFileName, os.getcwd())
    )
dfDirtyData = pd.read_csv(dataFileName, sep=",", header=0)


In [13]:
# Showing information about the raw dataframe (columns, non-null counts and dtypes)
dfDirtyData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11341 entries, 0 to 11340
Data columns (total 17 columns):
article_id                     11341 non-null int64
full_name                      11341 non-null object
sex                            11341 non-null object
birth_year                     11341 non-null object
city                           11341 non-null object
state                          2172 non-null object
country                        11308 non-null object
continent                      11311 non-null object
latitude                       10294 non-null float64
longitude                      10294 non-null float64
occupation                     11341 non-null object
industry                       11341 non-null object
domain                         11341 non-null object
article_languages              11341 non-null int64
page_views                     11341 non-null int64
average_views                  11341 non-null int64
historical_popularity_index    11341 non-null 

In [76]:
# Creating a variable to hold the subset of fields used for display, since not all columns will fit
columnToDisplayDirtyData = ['full_name', 'sex', 'birth_year', 'country', 'occupation','historical_popularity_index']

In [52]:
# Showing a single record: the row whose index label is 2, with all of its fields
dfDirtyData.loc[2]

article_id                    1095706
full_name                Jesus Christ
sex                              Male
birth_year                         -4
city                            Judea
state                             NaN
country                        Israel
continent                        Asia
latitude                         32.5
longitude                        34.9
occupation           Religious Figure
industry                     Religion
domain                   Institutions
article_languages                 214
page_views                   60299092
average_views                  281771
popularity                    31.8981
Name: 2, dtype: object

In [77]:
# Previewing the first 5 rows of the raw dataframe, limited to the display columns
dfDirtyData[columnToDisplayDirtyData].head(5)

Unnamed: 0,full_name,sex,birth_year,country,occupation,historical_popularity_index
0,Aristotle,Male,-384,Greece,Philosopher,31.9938
1,Plato,Male,-427,Greece,Philosopher,31.9888
2,Jesus Christ,Male,-4,Israel,Religious Figure,31.8981
3,Socrates,Male,-469,Greece,Philosopher,31.6521
4,Alexander the Great,Male,-356,Greece,Military Personnel,31.584


In [78]:
# Showing the last 5 rows of the data in the data frame
dfDirtyData.loc[:,columnToDisplayDirtyData].tail(5)

Unnamed: 0,full_name,sex,birth_year,country,occupation,historical_popularity_index
11336,Sean St Ledger,Male,1984,United Kingdom,Soccer Player,11.1346
11337,Saina Nehwal,Female,1990,India,Athlete,10.6122
11338,Rūta Meilutytė,Female,1997,Lithuania,Swimmer,10.3821
11339,Vladimír Weiss,Male,1989,Slovakia,Soccer Player,10.2495
11340,Missy Franklin,Female,1995,United States,Swimmer,9.8794


## Data Cleaning

In [89]:
# Creating an independent copy of the original dataframe.
# A plain assignment would only bind a second name to the same object, so the
# cleaning steps below (in-place rename, value replacement, type conversion)
# would silently modify dfDirtyData as well.
dfCleanData = dfDirtyData.copy()

In [90]:
# Finding the positions of the values in a column that cannot be converted to an integer.
def findStringValueIndex(fieldName, df=None):
    """Return the positions of the entries of a column that int() rejects.

    Parameters
    ----------
    fieldName : label of the column to scan.
    df : optional DataFrame to scan. When omitted, the notebook-level
        dfCleanData is used, which keeps existing one-argument calls working.

    Returns
    -------
    list of int
        Zero-based positions (not index labels) of the offending entries, in
        order of appearance. They only coincide with the index labels when the
        frame has a default 0..n-1 index.
    """
    if df is None:
        df = dfCleanData
    badRowIndex = []  # positions of the problematic rows
    for idx, value in enumerate(df[fieldName]):
        try:
            int(value)
        # Only conversion failures are recorded: ValueError (non-numeric text, NaN),
        # TypeError (None and other unsupported types), OverflowError (infinity).
        # A bare except would also swallow KeyboardInterrupt and SystemExit.
        except (ValueError, TypeError, OverflowError):
            badRowIndex.append(idx)
    return badRowIndex

In [112]:
# Giving historical_popularity_index the shorter name popularity
dfCleanData.rename(columns={'historical_popularity_index': 'popularity'}, inplace=True)

# Columns that are expected to hold numeric data
columnToConvert = [
    "article_languages",
    "birth_year",
    "latitude",
    "longitude",
    "page_views",
    "average_views",
    "popularity",
]

# Before the numeric conversion, overwrite every value that int() rejects with 0.
# NOTE(review): 0 can also be a legitimate value for these fields, so replaced
# entries cannot be told apart from real zeros afterwards.
for col in columnToConvert:
    indx = findStringValueIndex(col)
    if indx:
        dfCleanData.loc[indx, col] = 0

In [113]:
# Converting every column listed in columnToConvert to a numeric dtype
# (some of them, such as birth_year, look like numbers but were loaded as object)
dfCleanData[columnToConvert] = dfCleanData[columnToConvert].apply(pd.to_numeric)

In [114]:
# Showing information about the cleaned dataframe (columns, non-null counts and dtypes)
dfCleanData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11341 entries, 0 to 11340
Data columns (total 17 columns):
article_id           11341 non-null int64
full_name            11341 non-null object
sex                  11341 non-null object
birth_year           11341 non-null int64
city                 11341 non-null object
state                2172 non-null object
country              11308 non-null object
continent            11311 non-null object
latitude             11341 non-null float64
longitude            11341 non-null float64
occupation           11341 non-null object
industry             11341 non-null object
domain               11341 non-null object
article_languages    11341 non-null int64
page_views           11341 non-null int64
average_views        11341 non-null int64
popularity           11341 non-null float64
dtypes: float64(3), int64(5), object(9)
memory usage: 1.5+ MB


In [110]:
# Creating a variable to hold the subset of fields used for display, since not all columns will fit
columnToDisplayCleanData = ['full_name', 'sex', 'birth_year', 'country', 'occupation','popularity','longitude','latitude','page_views','average_views']

In [45]:
# Previewing the first 5 rows of the cleaned dataset, limited to the display columns
dfCleanData[columnToDisplayCleanData].head(5)

Unnamed: 0,full_name,sex,birth_year,country,occupation,popularity,longitude,latitude,page_views,average_views
0,Aristotle,Male,-384,Greece,Philosopher,31.9938,23.5,40.33333,56355172,370758
1,Plato,Male,-427,Greece,Philosopher,31.9888,23.71667,37.96667,46812003,329662
2,Jesus Christ,Male,-4,Israel,Religious Figure,31.8981,34.9,32.5,60299092,281771
3,Socrates,Male,-469,Greece,Philosopher,31.6521,23.71667,37.96667,40307143,294213
4,Alexander the Great,Male,-356,Greece,Military Personnel,31.584,22.51667,40.8,48358148,350421


In [46]:
# Previewing the last 5 rows of the cleaned dataset, limited to the display columns
dfCleanData[columnToDisplayCleanData].tail(5)

Unnamed: 0,full_name,sex,birth_year,country,occupation,popularity,longitude,latitude,page_views,average_views
11336,Sean St Ledger,Male,1984,United Kingdom,Soccer Player,11.1346,-1.77209,52.4129,380965,13606
11337,Saina Nehwal,Female,1990,India,Athlete,10.6122,75.7167,29.1667,2255010,86731
11338,Rūta Meilutytė,Female,1997,Lithuania,Swimmer,10.3821,23.88611,54.9,342719,10710
11339,Vladimír Weiss,Male,1989,Slovakia,Soccer Player,10.2495,17.10972,48.14389,12479,462
11340,Missy Franklin,Female,1995,United States,Swimmer,9.8794,-118.13194,34.15611,1894716,72874


## Exploration

In [111]:
# Which historic characters are in the top ten most popular?
# Order by popularity from highest to lowest and keep the first ten rows.
dfTopTenMostPopularPeople = dfCleanData.sort_values(by=['popularity'], ascending=False).head(10)
dfTopTenMostPopularPeople[columnToDisplayCleanData]

Unnamed: 0,full_name,sex,birth_year,country,occupation,popularity,longitude,latitude,page_views,average_views
0,Aristotle,Male,-384,Greece,Philosopher,31.9938,23.5,40.33333,56355172,370758
1,Plato,Male,-427,Greece,Philosopher,31.9888,23.71667,37.96667,46812003,329662
2,Jesus Christ,Male,-4,Israel,Religious Figure,31.8981,34.9,32.5,60299092,281771
3,Socrates,Male,-469,Greece,Philosopher,31.6521,23.71667,37.96667,40307143,294213
4,Alexander the Great,Male,0,Greece,Military Personnel,31.584,22.51667,40.8,48358148,350421
5,Leonardo da Vinci,Male,1452,Italy,Inventor,31.4644,10.91667,43.78333,88931135,511098
6,Confucius,Male,-551,China,Philosopher,31.3705,0.0,0.0,22363652,116477
7,Julius Caesar,Male,-100,Italy,Politician,31.1161,12.5,41.9,43088745,336631
8,Homer,Male,-800,Turkey,Writer,31.1087,27.13917,38.41861,20839405,147797
9,Pythagoras,Male,-570,Greece,Philosopher,31.0691,26.83333,37.75,26168219,229546


In [48]:
# Counting the ten most popular people per continent and country;
# the counted article_id column is renamed to total_count.
dfTotalTop10ByContinentByCountry = pd.DataFrame(
    dfTopTenMostPopularPeople.groupby(['continent', 'country'])['article_id'].count()
).rename(columns={'article_id': 'total_count'})
dfTotalTop10ByContinentByCountry

Unnamed: 0_level_0,Unnamed: 1_level_0,total_count
continent,country,Unnamed: 2_level_1
Asia,China,1
Asia,Israel,1
Europe,Greece,5
Europe,Italy,2
Europe,Turkey,1


In [105]:
# Which historic characters are the ten least popular?
# Order by popularity from lowest to highest and keep the first ten rows.
dfTopTenLeastPopularPeople = dfCleanData.sort_values(by=['popularity'], ascending=True).head(10)
dfTopTenLeastPopularPeople[columnToDisplayCleanData]

Unnamed: 0,full_name,sex,birth_year,country,occupation,popularity
11340,Missy Franklin,Female,1995,United States,Swimmer,9.8794
11339,Vladimír Weiss,Male,1989,Slovakia,Soccer Player,10.2495
11338,Rūta Meilutytė,Female,1997,Lithuania,Swimmer,10.3821
11337,Saina Nehwal,Female,1990,India,Athlete,10.6122
11336,Sean St Ledger,Male,1984,United Kingdom,Soccer Player,11.1346
11335,Jetro Willems,Male,1994,Netherlands,Soccer Player,11.3956
11334,Rebecca Soni,Female,1987,United States,Swimmer,11.405
11333,Sun Yang,Male,1991,China,Swimmer,11.6234
11332,Shane Long,Male,1987,Ireland,Soccer Player,11.7174
11331,Marc Albrighton,Male,1989,United Kingdom,Soccer Player,11.7258


In [106]:
# Counting the ten least popular people per continent and country;
# the counted article_id column is renamed to total_count.
dfTotalTop10LByContinentByCountry = pd.DataFrame(
    dfTopTenLeastPopularPeople.groupby(['continent', 'country'])['article_id'].count()
).rename(columns={'article_id': 'total_count'})
dfTotalTop10LByContinentByCountry

Unnamed: 0_level_0,Unnamed: 1_level_0,total_count
continent,country,Unnamed: 2_level_1
Asia,China,1
Asia,India,1
Europe,Ireland,1
Europe,Lithuania,1
Europe,Netherlands,1
Europe,Slovakia,1
Europe,United Kingdom,2
North America,United States,2


In [102]:
# Getting the number of records for each industry / occupation pair.
# The counted article_id column is given the meaningful name total_count.
dfTotalByCountryByIndustry = pd.DataFrame(
    dfCleanData.groupby(['industry', 'occupation'])['article_id'].count()
).rename(columns={'article_id': 'total_count'})
# Largest groups first; show the top 20
dfTotalByCountryByIndustry.sort_values(by=['total_count'], ascending=False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_count
industry,occupation,Unnamed: 2_level_1
Government,Politician,2530
Film And Theatre,Actor,1193
Team Sports,Soccer Player,1064
Language,Writer,954
Religion,Religious Figure,518
Music,Singer,437
Music,Musician,381
Philosophy,Philosopher,281
Natural Sciences,Physicist,268
Music,Composer,225


In [103]:
# Getting the number of records for each continent / country pair.
# The counted article_id column is given the meaningful name total_count.
dfTotalByContinentByCountry = pd.DataFrame(
    dfCleanData.groupby(['continent', 'country'])['article_id'].count()
).rename(columns={'article_id': 'total_count'})
# Largest groups first; show the top 10
dfTotalByContinentByCountry.sort_values(by=['total_count'], ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_count
continent,country,Unnamed: 2_level_1
North America,United States,2168
Europe,United Kingdom,1145
Europe,France,866
Europe,Italy,808
Europe,Germany,747
Unknown,Unknown,405
Europe,Russia,374
Europe,Spain,296
Asia,Turkey,202
Europe,Poland,173


In [122]:
# Getting the number of records for each continent.
# The counted article_id column is given the meaningful name total_count.
dfTotalByContinent = pd.DataFrame(
    dfCleanData.groupby(['continent'])['article_id'].count()
).rename(columns={'article_id': 'total_count'})
# Largest groups first
dfTotalByContinent.sort_values(by=['total_count'], ascending=False)

Unnamed: 0_level_0,total_count
continent,Unnamed: 1_level_1
Europe,6368
North America,2439
Asia,1188
Africa,419
Unknown,408
South America,366
Oceania,123


## Modeling

# Results

# Conclusion