# Part 2
## The CSV Package
### Reading and Writing CSV files


In [1]:
import csv

In [2]:
with open('/Users/zeynoyalcin/Downloads/survey-responses.csv') as csvfile:
    reader = csv.reader(csvfile)
    rows = [row for row in reader] 
    
print(len(rows))

18


In [3]:
rows[1]

['2020', '16', '167', '59', 'Tea', 'Otter', 'The Arrival', 'Margaret Atwood']

In [4]:
with open('/Users/zeynoyalcin/Downloads/survey-responses.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    rows = [row for row in reader] 
    
len(rows)

17

In [5]:
rows[1]

OrderedDict([('Your class year:', '2020'),
             ('Your favorite number:', '17'),
             ('Your height in centimeters:', '152'),
             ('Your weight in kilogram:', '66'),
             ('Your favorite drink:', 'Water'),
             ('What is your favorite animal?', 'Cat'),
             ('What is your favorite movie?', 'North by Northwest'),
             ('Who is your favorite woman author?', 'Tamora Pierce')])

In [6]:
import json
with open('/Users/zeynoyalcin/Downloads/women2020.json') as file:
    data= json.load(file)
    
from collections import defaultdict
women2020Dict = defaultdict(list)

import csv

In [7]:
import csv

for name in women2020Dict:
    with open(str(name)+'.csv', 'w') as csvfile:
        fieldnames = women2020Dict[name][0].keys()
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i in range(len(women2020Dict[name])):
            writer.writerow(women2020Dict[name][i])

In [8]:
url = 'https://www.politico.com/story/2018/12/17/poll-klobuchar-rising-in-iowa-1067256'

from urllib.parse import urlparse

result = urlparse(url)
print(result.netloc)

www.politico.com


In [9]:
from urllib.parse import urlparse
from collections import Counter
uniqueArticlesDict= {}

def sourcesFromFile(file): 
    d= defaultdict(Counter)
    with open(file, newline= '') as file:
        reader = csv.DictReader(file, delimiter=',')
        for row in reader:
            url = row['url']
            source = urlparse(url).netloc
            d[source][url] += 1
    names = file.name.split()
    lastnames = names[1].replace(".csv","")
            
    with open(lastnames+ '-media.json','w') as f:
        json.dump(d, f)
        return d 

In [10]:
elizabeth_dict = sourcesFromFile("Elizabeth Warren.csv")
kirsten_dict = sourcesFromFile("Kirsten Gillibrand.csv")
amy_dict= sourcesFromFile("Amy Klobuchar.csv")
kamala_dict= sourcesFromFile("Kamala Harris.csv")

In [11]:
def getTop5Sources(mediaDict): 
    counterDict= Counter()
    for mediaName in mediaDict:
            counterDict[mediaName] = len(mediaDict[mediaName])
            #the length gives you the number of unqiue keys
    top5= counterDict.most_common(5)
    return(top5)         

In [12]:
getTop5Sources(kirsten_dict)

[('thehill.com', 112),
 ('www.foxnews.com', 71),
 ('www.washingtonexaminer.com', 65),
 ('www.politico.com', 48),
 ('www.cnn.com', 45)]

In [13]:
getTop5Sources(amy_dict)

[('thehill.com', 103),
 ('www.cnn.com', 60),
 ('www.startribune.com', 57),
 ('www.washingtonpost.com', 49),
 ('www.politico.com', 41)]

In [14]:
getTop5Sources(elizabeth_dict)

[('thehill.com', 192),
 ('www.foxnews.com', 138),
 ('www.cnn.com', 125),
 ('www.washingtonpost.com', 113),
 ('www.washingtonexaminer.com', 97)]

In [15]:
getTop5Sources(kamala_dict)

[('thehill.com', 180),
 ('www.cnn.com', 142),
 ('www.foxnews.com', 119),
 ('www.politico.com', 118),
 ('www.washingtonexaminer.com', 86)]

# Part 3 
## Simple Data Science Tasks 
### Problem 1: Calculate descriptive statistics for a list of numbers

In [16]:
import pandas as pd
numbers = [10, 14, 11, 11, 9, 8, 7, 10, 11]
numS = pd.Series(numbers)
print(numS.describe())

count     9.000000
mean     10.111111
std       2.027588
min       7.000000
25%       9.000000
50%      10.000000
75%      11.000000
max      14.000000
dtype: float64


In [17]:
import math
count = len(numbers)
print("N (count): ", count)
mean = sum(numbers)/count
print("mean: ",mean)
x = 0
for i in numbers:
    x+= (i-mean)**2
std = math.sqrt(x/count)
print("Standard Deviation: ",std)
sorted_num = sorted(numbers)
index25 = int(25/100 * count)
index50 = int(50/100 *count)
index75 = int(75/100 * count)
print("minimum: ",sorted_num[0])
print("25%: ",sorted_num[index25])
print("50%: ",sorted_num[index50])
print("75%: ",sorted_num[index75])
print("maximum: ", sorted_num[len(sorted_num)-1])

N (count):  9
mean:  10.11111111111111
Standard Deviation:  1.9116278371205837
minimum:  7
25%:  9
50%:  10
75%:  11
maximum:  14


In [18]:
import string
twentyLetters =  string.ascii_uppercase[:20]

In [19]:
import random

#Algorithym 1: Shuffling to rearrange and then put them in buckets of 5 
from random import shuffle
shuffled_nums = [twentyLetters[i] for i in range(len(twentyLetters))]
shuffle(shuffled_nums)

for i in range(0,len(shuffled_nums),5):
    print(shuffled_nums[i:i+5])

['G', 'J', 'C', 'I', 'O']
['P', 'T', 'K', 'A', 'N']
['B', 'S', 'E', 'F', 'R']
['L', 'H', 'Q', 'D', 'M']


In [20]:
#Algorithym 2: Sampling without replacement 
randnumsList = random.sample(range(len(twentyLetters)), len(twentyLetters))
groups = []

for i in randnumsList:
    index = randnumsList[i]
    groups.append(twentyLetters[index])

counter = 0
while counter<20:
    grouper = groups[counter:counter+5]
    counter+=5
    print(grouper)


['S', 'C', 'K', 'H', 'O']
['F', 'R', 'N', 'G', 'D']
['E', 'Q', 'L', 'T', 'B']
['P', 'M', 'A', 'I', 'J']
