# Topic 03: Data Analysis in Base Python

### Lambdas

Lambdas are small anonymous functions, handy for sorting, maps, and filters.

In [1]:
# 1. Sorting!! 

import requests
# Download the Project Gutenberg text of Macbeth.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() keeps us from silently counting the words of an error page.
response = requests.get('http://www.gutenberg.org/cache/epub/2264/pg2264.txt', timeout=30)
response.raise_for_status()
macbeth = response.text
# Split the transcript into whitespace-delimited tokens (punctuation stays attached)
words = macbeth.split()
# Map each distinct token to the number of times it occurs
word_counts = {}
for word in words:
    # .get(word, 0) returns the running count, or 0 the first time we see the token
    word_counts[word] = word_counts.get(word, 0) + 1
# Convert to a list of (word, count) tuples so it can be sorted
counts = list(word_counts.items())


In [2]:
# Display every (word, count) pair; there is one entry per distinct token, so the output is long
counts

[('\ufeff', 1),
 ('***The', 2),
 ('Project', 23),
 ("Gutenberg's", 3),
 ('Etext', 5),
 ('of', 395),
 ("Shakespeare's", 6),
 ('First', 4),
 ('Folio***', 2),
 ('********************The', 2),
 ('Tragedie', 6),
 ('Macbeth*********************', 2),
 ('*******************************************************************', 2),
 ('THIS', 4),
 ('EBOOK', 2),
 ('WAS', 1),
 ('ONE', 1),
 ('OF', 9),
 ('PROJECT', 6),
 ("GUTENBERG'S", 1),
 ('EARLY', 1),
 ('FILES', 1),
 ('PRODUCED', 1),
 ('AT', 1),
 ('A', 29),
 ('TIME', 1),
 ('WHEN', 1),
 ('PROOFING', 1),
 ('METHODS', 1),
 ('AND', 1),
 ('TOOLS', 1),
 ('WERE', 1),
 ('NOT', 6),
 ('WELL', 1),
 ('DEVELOPED.', 1),
 ('THERE', 1),
 ('IS', 2),
 ('AN', 1),
 ('IMPROVED', 1),
 ('EDITION', 1),
 ('TITLE', 1),
 ('WHICH', 1),
 ('MAY', 3),
 ('BE', 2),
 ('VIEWED', 1),
 ('AS', 2),
 ('(#1533)', 1),
 ('at', 54),
 ('https://www.gutenberg.org/ebooks/1533', 1),
 ('This', 34),
 ('is', 185),
 ('our', 116),
 ('3rd', 1),
 ('edition', 2),
 ('most', 25),
 ('these', 30),
 ('plays.'

In [3]:
# Rank the (word, count) pairs from most to least frequent, then keep the first 25.
# The lambda picks index 1 of each tuple (the count) to use as the sort key.
ranked_counts = sorted(counts, key=lambda pair: pair[1], reverse=True)
top_25 = ranked_counts[:25]

In [4]:
# 2. Maps and filters

l = list(range(1, 7))  # the integers 1 through 6

In [5]:
# map applies the lambda (n -> n + 2) to every element; list() collects the results
list(map(lambda n: n + 2, l))

[3, 4, 5, 6, 7, 8]

In [6]:
# filter keeps only the elements for which the lambda returns True (here: even numbers)
list(filter(lambda n: n % 2 == 0, l))

[2, 4, 6]

## Making Lists from Lists
List comprehensions (and dictionary comprehensions!)

In [7]:
# The long way: start with an empty list and grow it one element at a time
new = []
for value in l:
    new += [value + 2]
new

[3, 4, 5, 6, 7, 8]

In [8]:
# The same result as the loop above, written as a list comprehension
[value + 2 for value in l]

[3, 4, 5, 6, 7, 8]

In [9]:
# Dictionary comprehension: each element becomes a key, mapped to itself plus one
{value: value + 1 for value in l}

{1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7}

In [10]:
# The trailing `if` filters first: only even elements get the + 2 and land in the result
[value + 2 for value in l if value % 2 == 0]

[4, 6, 8]

## Nested Dictionaries

Nested dictionaries are a very common data structure, especially once you start working with JSON and APIs.

In [11]:
def _player(name, shirt_number, position, captain=False):
    """Build one player record with the keys name, captain, shirt_number, position."""
    return {
        "name": name,
        "captain": captain,
        "shirt_number": shirt_number,
        "position": position,
    }


# One dictionary per team; each team nests a list of player dictionaries under "players".
soccer_match = [
    {
        "home_team": True,
        "away_team": False,
        "country": "France",
        "num_passes": 484,
        "passes_completed": 423,
        "fouls_committed": 16,
        "colors": ["blue", "white", "red"],
        "players": [
            _player("Hugo LLORIS", 1, "Goalie", captain=True),
            _player("Benjamin PAVARD", 2, "Defender"),
            _player("Raphael VARANE", 4, "Defender"),
            _player("Samuel UMTITI", 5, "Defender"),
            _player("Paul POGBA", 6, "Midfield"),
            _player("Antoine GRIEZMANN", 7, "Forward"),
            _player("Kylian MBAPPE", 10, "Forward"),
            _player("Ousmane DEMBELE", 11, "Forward"),
            _player("Corentin TOLISSO", 12, "Midfield"),
            _player("Ngolo KANTE", 13, "Midfield"),
            _player("Lucas HERNANDEZ", 21, "Defender"),
        ],
    },
    {
        "home_team": False,
        "away_team": True,
        "country": "Australia",
        "num_passes": 390,
        "passes_completed": 332,
        "fouls_committed": 19,
        "colors": ["green", "gold"],
        "players": [
            _player("Mathew RYAN", 1, "Goalie"),
            _player("Mark MILLIGAN", 5, "Defender"),
            _player("Mathew LECKIE", 7, "Forward"),
            _player("Robbie KRUSE", 10, "Forward"),
            _player("Andrew NABBOUT", 11, "Forward"),
            _player("Aaron MOOY", 13, "Midfield"),
            _player("Mile JEDINAK", 15, "Midfield", captain=True),
            _player("Aziz BEHICH", 16, "Defender"),
            _player("Joshua RISDON", 19, "Defender"),
            _player("Trent SAINSBURY", 20, "Defender"),
            _player("Tom ROGIC", 23, "Midfield"),
        ],
    },
]

In [12]:
# Explore the schema: what kind of container is this, how many entries does it
# hold, and what does a single entry look like?
print(type(soccer_match), len(soccer_match), sep='\n')

first_team = soccer_match[0]
print(first_team)


<class 'list'>
2
{'home_team': True, 'away_team': False, 'country': 'France', 'num_passes': 484, 'passes_completed': 423, 'fouls_committed': 16, 'colors': ['blue', 'white', 'red'], 'players': [{'name': 'Hugo LLORIS', 'captain': True, 'shirt_number': 1, 'position': 'Goalie'}, {'name': 'Benjamin PAVARD', 'captain': False, 'shirt_number': 2, 'position': 'Defender'}, {'name': 'Raphael VARANE', 'captain': False, 'shirt_number': 4, 'position': 'Defender'}, {'name': 'Samuel UMTITI', 'captain': False, 'shirt_number': 5, 'position': 'Defender'}, {'name': 'Paul POGBA', 'captain': False, 'shirt_number': 6, 'position': 'Midfield'}, {'name': 'Antoine GRIEZMANN', 'captain': False, 'shirt_number': 7, 'position': 'Forward'}, {'name': 'Kylian MBAPPE', 'captain': False, 'shirt_number': 10, 'position': 'Forward'}, {'name': 'Ousmane DEMBELE', 'captain': False, 'shirt_number': 11, 'position': 'Forward'}, {'name': 'Corentin TOLISSO', 'captain': False, 'shirt_number': 12, 'position': 'Midfield'}, {'name': 'N

#### Write a function that counts how many players have even shirt numbers:

Hints to make your life easier: 
- What is annoying about the structure the data is currently in?
- You can 100% write code to change the data to make it easier to work with
- You can write **helper functions** to do this task

In [47]:
def get_player_numbers(match):
    """Flatten a match into a list of (name, shirt_number) tuples.

    Parameters
    ----------
    match : list of dict
        One dictionary per team; each must have a 'players' list whose
        entries carry 'name' and 'shirt_number' keys.

    Returns
    -------
    list of tuple
        (name, shirt_number) pairs, with teams in their original order and
        players in their original order within each team.
    """
    return [
        (player['name'], player['shirt_number'])
        for team in match
        for player in team['players']
    ]


def count_even_shirt_numbers(players):
    """Count the (name, shirt_number) pairs whose shirt number is even."""
    return sum(1 for _name, shirt_number in players if shirt_number % 2 == 0)


# Keep the flattened list in its own variable so it can be reused after this cell
player_numbers = get_player_numbers(soccer_match)

even_count = count_even_shirt_numbers(player_numbers)
print(f'There are {even_count} players with even shirt numbers.')

There are 8 players with even shirt numbers.


#### Write a function that appends player names and shirt numbers to a list until the sum of shirt numbers reaches 100:

In [50]:
def take_until_sum(players, target=100):
    """Collect players in order until the running shirt-number total reaches `target`.

    Parameters
    ----------
    players : list of tuple
        (name, shirt_number) pairs, read in order.
    target : int, optional
        Stop collecting once the running total is at least this value.

    Returns
    -------
    tuple
        (selected, total): the collected pairs and the sum of their shirt
        numbers. The player whose number carries the total to or past
        `target` is included, so `total` can overshoot `target`.
    """
    selected = []
    total = 0
    for player in players:
        # Check before adding: once the target has been reached, take no more players
        if total >= target:
            break
        total += player[1]
        selected.append(player)
    return selected, total


one_hundred, counter = take_until_sum(player_numbers)

print(counter)
print(one_hundred)
print(player_numbers)

105
[('Hugo LLORIS', 1), ('Benjamin PAVARD', 2), ('Raphael VARANE', 4), ('Samuel UMTITI', 5), ('Paul POGBA', 6), ('Antoine GRIEZMANN', 7), ('Kylian MBAPPE', 10), ('Ousmane DEMBELE', 11), ('Corentin TOLISSO', 12), ('Ngolo KANTE', 13), ('Lucas HERNANDEZ', 21), ('Mathew RYAN', 1), ('Mark MILLIGAN', 5), ('Mathew LECKIE', 7)]
[('Hugo LLORIS', 1), ('Benjamin PAVARD', 2), ('Raphael VARANE', 4), ('Samuel UMTITI', 5), ('Paul POGBA', 6), ('Antoine GRIEZMANN', 7), ('Kylian MBAPPE', 10), ('Ousmane DEMBELE', 11), ('Corentin TOLISSO', 12), ('Ngolo KANTE', 13), ('Lucas HERNANDEZ', 21), ('Mathew RYAN', 1), ('Mark MILLIGAN', 5), ('Mathew LECKIE', 7), ('Robbie KRUSE', 10), ('Andrew NABBOUT', 11), ('Aaron MOOY', 13), ('Mile JEDINAK', 15), ('Aziz BEHICH', 16), ('Joshua RISDON', 19), ('Trent SAINSBURY', 20), ('Tom ROGIC', 23)]


## Reading a JSON Schema

Here's the JSON schema provided for a section of the NY Times API:

<img src="images/schema_detailed.png" width="300">

In [None]:
import json

# here's how we open a file in Python
# The `with` block closes the file as soon as parsing finishes, even if
# json.load raises. JSON text is UTF-8, so we say so explicitly instead of
# relying on the platform's default encoding.
with open('ny_times_response.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# First look at the parsed response: its top-level type, then its top-level keys
print(type(data), data.keys(), sep='\n')

### Getting New York Times Headlines

In [None]:
# Drill down to the first document and list the fields it carries
first_doc = data['response']['docs'][0]
first_doc.keys()

In [None]:
# Pull the documents out of the response wrapper, then check their container type and count
response_body = data['response']
docs = response_body['docs']
print(type(docs), len(docs))

In [None]:
# Print the raw 'headline' entry of every document
for article in docs:
    headline = article['headline']
    print(headline)

In [None]:
# Print only the 'main' headline text, followed by two blank lines as a separator
for article in docs:
    main_headline = article['headline']['main']
    print(main_headline, end='\n\n\n')

### Turning JSON into DataFrames

Pandas can take a **list** of **dictionaries** and automatically format it into a DataFrame, where keys become column headers. However, note what **headline** looks like: each cell in that column still holds a nested dictionary.

In [None]:
import pandas as pd

In [None]:
# Each document dictionary becomes one row; its keys become the column headers
articles = data['response']['docs']
df = pd.DataFrame(articles)
df

In [None]:
# The first row's headline dictionary tells us which sub-fields to extract
# (we'll talk about this code more in Topic 4/5!)
headline_keys = list(df.headline.iloc[0].keys())

# One new column name per headline sub-field; kept so we can preview them below
new_cols = [f'headline_{key}' for key in headline_keys]

# Pull each sub-field out of the nested headline dictionaries into its own column
for key, new_col in zip(headline_keys, new_cols):
    df[new_col] = df.headline.map(lambda headline: headline[key])
df[new_cols].head()

In [None]:
# Writing JSON: serialise the object to a string, then write that string to disk.
# This works for any dictionary (or list of dictionaries) you want to save.

with open('soccer.json', 'w') as out_file:
    out_file.write(json.dumps(soccer_match))