# Pandas 1
## September 6th, 2022
### Overview: Pandas reading from files, indexing, datetime

In [1]:
import numpy as np
import pandas as pd
import datetime 

# Problem 1

In [3]:
# Prob 1
def prob1(file='budget.csv'):
    """
    Read a budget CSV (column 0 as the index) and apply four edits in order.

    1) Move the 'Groceries' column to the front; every other column keeps
       its original relative position.
    2) Sort the rows in descending order of the 'Groceries' column.
    3) Set every value in the 'Rent' column to 800.0.
    4) Set every value in the first 5 rows (after sorting) to 0.0.

    Parameters:
        file (str): name of datafile

    Return:
        values (ndarray): values of the updated DataFrame
    """
    budget = pd.read_csv(file, index_col=0)

    # Build the new column order from the columns that are actually present
    # so nothing is dropped or invented if the file's layout differs.
    remaining = [col for col in budget.columns if col != 'Groceries']
    budget = budget.reindex(columns=['Groceries'] + remaining)

    # Largest grocery spending first.
    budget = budget.sort_values('Groceries', ascending=False)

    # Overwrite the whole 'Rent' column with a single value.
    budget['Rent'] = 800.0

    # A positional slice zeroes at most the first five rows and does not
    # fail when the file holds fewer than five.
    budget.iloc[:5] = 0.0

    return budget.values

In [4]:
# Run Problem 1 on its default file ('budget.csv') and keep the returned array.
values1 = prob1()

# Problem 2

In [3]:
# Prob 2
def prob2(file='budget.csv'):
    """
    Find the columns that best track 'Living Expenses' and 'Other'.

    Read in file as a DataFrame (column 0 as the index) and fill all NaN
    values with 0.0. Then add two row-wise totals:
        'Living Expenses' = 'Rent' + 'Groceries' + 'Gas' + 'Utilities'
        'Other'           = 'Dining Out' + 'Out With Friends' + 'Netflix'
    Identify which column, other than 'Living Expenses' itself, correlates
    most with 'Living Expenses', and which column, other than 'Other'
    itself, correlates most with 'Other'.

    Parameters:
        file (str): name of datafile

    Return:
        values (tuple): (name of column that most relates to Living Expenses,
                         name of column that most relates to Other)
    """
    budget = pd.read_csv(file, index_col=0).fillna(0.0)

    # axis=1 sums across each row, giving one total per data point.
    budget['Living Expenses'] = budget[['Rent', 'Groceries', 'Gas', 'Utilities']].sum(axis=1)
    budget['Other'] = budget[['Dining Out', 'Out With Friends', 'Netflix']].sum(axis=1)

    corr = budget.corr()

    # Remove the column's correlation with itself, then take the largest
    # remaining entry. idxmax skips NaN, so a constant column (whose
    # correlation is undefined) cannot displace the real answer the way it
    # would when picking a fixed position out of a sorted list.
    living_match = corr['Living Expenses'].drop('Living Expenses').idxmax()
    other_match = corr['Other'].drop('Other').idxmax()

    return (living_match, other_match)

In [5]:
# Run Problem 2 on its default file ('budget.csv') and keep the returned tuple.
values2 = prob2()

# Problem 3

In [6]:
def prob3(file='crime_data.csv'):
    """
    Read in crime data (indexed by the 'Year' column) and answer three
    questions, returned together as a tuple.

    1) Among the crimes whose mean is over 1,500,000, find the two that are
       most correlated with each other and return the name of the one with
       the greater maximum value. The 'Population' and 'Total' columns are
       not individual crimes and are excluded.

    2) Take the data from 2000 onward, sort it in ascending order of
       'Murder', and return the years (in that sorted order) in which
       'Aggravated Assault' is greater than 850,000.

    3) Find the year with the highest crime rate ('Total' / 'Population'),
       find the crime committed most in that year, and return that crime's
       share of the year's 'Total' as a fraction (not multiplied by 100).

    Parameters:
        file (str): data

    Return:
        ans_1 (string): answer to Question 1
        ans_2 (ndarray): answer to Question 2
        ans_3 (float): answer to Question 3
    """
    crime = pd.read_csv(file, index_col='Year')

    # Columns that describe the whole year rather than a single crime.
    aggregates = ['Population', 'Total']

    # Q1
    means = crime.mean()
    # Exclude the aggregate columns by name instead of by position so the
    # result does not depend on the column order in the file.
    frequent = means[means > 1.5e6].index.drop(aggregates, errors='ignore')

    # Blank out the diagonal (each column's correlation with itself) so the
    # largest remaining entry is the most correlated pair of distinct crimes.
    corr = crime[frequent].corr()
    off_diagonal = ~np.eye(len(corr), dtype=bool)
    first, second = corr.where(off_diagonal).unstack().idxmax()

    # The question asks which of the two has the greater maximum value.
    ans_1 = first if crime[first].max() >= crime[second].max() else second

    # Q2
    # Label slice on the 'Year' index, then order the rows by murders.
    since_2000 = crime.loc[2000:].sort_values('Murder')
    many_assaults = since_2000['Aggravated Assault'] > 8.5e5
    ans_2 = since_2000.index[many_assaults].values

    # Q3
    crime_rate = crime['Total'] / crime['Population']
    peak_year = crime_rate.idxmax()
    # Drop the aggregates so only crime categories compete for "most committed".
    crimes_that_year = crime.loc[peak_year].drop(aggregates)
    top_crime = crimes_that_year.idxmax()
    # Cast so the answer is a plain Python float rather than a NumPy scalar.
    ans_3 = float(crimes_that_year[top_crime] / crime.loc[peak_year, 'Total'])

    return (ans_1, ans_2, ans_3)

In [7]:
# Run Problem 3 on its default file ('crime_data.csv') and unpack the three answers.
a1,a2,a3 = prob3()

In [21]:
# Call Problem 3 again as a bare expression so the notebook displays the returned tuple.
prob3()

('Property',
 array([2000, 2001, 2002, 2003, 2005, 2007, 2006]),
 0.8997188308734142)

# Problem 4

In [10]:
def prob4(file='DJIA.csv',dropna=True):
    """
    Load the stock market data with a DatetimeIndex built from the "DATE" column.

    The "VALUE" column is read as floats, with '.' entries treated as
    missing (NaN). Rows with missing values are removed unless dropna is
    False.

    Parameters:
        file (str): data file
        dropna (bool): whether to drop the rows containing nan values, defaults to true
    Returns:
        df (DataFrame): updated DataFrame of stock market data
    """
    # '.' marks a missing reading in the file; parse it as NaN so VALUE can be float
    raw = pd.read_csv(file, na_values='.', dtype={'VALUE': np.float64})

    # the parsed dates become the index and the text DATE column is discarded
    dates = pd.to_datetime(raw['DATE'])
    indexed = raw.drop(columns='DATE').set_index(dates)

    # optionally discard the rows that have no numerical value
    return indexed.dropna() if dropna else indexed

In [11]:
# Run Problem 4 with its defaults ('DJIA.csv', dropna=True) and keep the DataFrame.
djia = prob4()

# Problem 5

In [16]:
def prob5(file='paychecks.csv'):
    """
    Read the paycheck amounts and index them by pay date.

    The file has no header row; its single column is named 'Amount of Pay'.
    The index, named 'Date', holds one entry per paycheck: every other
    Friday, beginning with the first Friday on or after March 13th, 2008.

    Parameters:
        file (str): data file
    Returns:
        df (DataFrame): DataFrame of paycheck data
    """
    # Supply the column name explicitly so the first amount is not consumed
    # as a header.
    pay = pd.read_csv(file, names=['Amount of Pay'])

    # Size the date range from the data instead of assuming a fixed number
    # of paychecks. The '2W-FRI' frequency rolls the Thursday start date
    # forward to Friday, 2008-03-14.
    pay.index = pd.date_range(start='3/13/2008', periods=len(pay),
                              freq='2W-FRI', name='Date')

    return pay

In [17]:
# Run Problem 5 on its default file ('paychecks.csv') and keep the DataFrame.
pay = prob5()

In [22]:
# Bare expression so the notebook displays the paycheck DataFrame.
pay

Date,Amount of Pay
2008-03-14,1122.26
2008-03-28,921.03
2008-04-11,962.46
2008-04-25,1035.97
2008-05-09,1078.59
...,...
2011-07-29,1095.53
2011-08-12,1018.39
2011-08-26,1027.08
2011-09-09,1005.90


# Problem 6

In [19]:
def prob6(file='DJIA.csv'):
    """
    Find the days with the largest single-day gain and loss in the DJIA data.

    The change for a day is its value minus the value of the row directly
    before it (a signed difference, not an absolute one). Rows with missing
    values are kept when loading, so a difference is only defined between
    two consecutive rows that both have a value.

    Parameters:
        file (str): data file
    Returns:
        max_day (Series): for each data column, the date of the largest gain
        min_day (Series): for each data column, the date of the largest loss
    """
    # keep the nan rows so every difference is taken between adjacent rows of the file
    prices = prob4(file=file, dropna=False)

    # signed row-to-row change; the first row and any row next to a nan give nan
    daily_change = prices.diff()

    # most positive change is the biggest gain, most negative is the biggest loss
    return daily_change.idxmax(), daily_change.idxmin()

In [23]:
# Call Problem 6 as a bare expression so the notebook displays the returned pair.
prob6()

(VALUE   2008-10-13
 dtype: datetime64[ns], VALUE   2008-09-29
 dtype: datetime64[ns])