In [23]:
import pandas as pd
from scipy import stats

df = pd.read_excel('C:/Users/Ted/Downloads/FinancialSample.xlsx')
#df = pd.read_csv('C:/Users/Ted/Downloads/FinancialSample.csv')

target = 'Country'

# for every column that is not the target column, check the dtype of that 
# column and the target column and see if they are both object
for column in df.columns:
    if target != column:
        #object vs object: chi2
        if df[target].dtype == 'object' and df[column].dtype == 'object':
            print(f'Target: {target}')
            print(f'Column: {column}')
            print()
            ct = pd.crosstab(df[target], df[column])
            print(ct)
            print()
            
            x, p, dof, expected = stats.chi2_contingency(ct)
            critical_value = stats.chi2.ppf(1 - 0.05, dof)
            
            print(f'x2: {x}')
            print(f'Critical Value: {critical_value}')          # this is the value that chi squared must be below or above to determine if it is high or low
            print(f'p: {p}')
            
            print()
            print('Conclusion:')
            
            if p < 0.05:
                if x > critical_value:
                    print('Correlation')
                    print()
                    print('p value is below the significance threshold of 0.05')
                    print('chi2 value is greater than critical value')
                    print('the difference between observed and expected frequencies is too large to be attributed to chance')

                else: 
                    print('No Correlation')
                    print()
                    print('p value is above the significance threshold of 0.05')
                    print('chi2 value is less than critical value')
                    print('the difference between observed and expected frequencies is too small and could be attributed to chance')
                    
            else:
                print('No Correlation')
                print()
                print('p value is above the significance threshold of 0.05')
            
            print()
            print('---------------------------------------------------------')
            print()
            

Target: Country
Column: Segment

Segment                   Channel Partners  Enterprise  Government  Midmarket  \
Country                                                                         
Canada                                  20          20          60         20   
France                                  20          20          60         20   
Germany                                 20          20          60         20   
Mexico                                  20          20          60         20   
United States of America                20          20          60         20   

Segment                   Small Business  
Country                                   
Canada                                20  
France                                20  
Germany                               20  
Mexico                                20  
United States of America              20  

x2: 0.0
Critical Value: 26.29622760486423
p: 1.0

Conclusion:
No Correlation

p value is above th

In [24]:
import pandas as pd
from scipy import stats

df = pd.read_excel('C:/Users/Ted/Downloads/FinancialSample.xlsx')
#df = pd.read_csv('C:/Users/Ted/Downloads/FinancialSample.csv')

target = 'Units Sold'

# for every column that is not the target column, check the dtype of that 
# column and the target column and see if they are both object
for column in df.columns:
    if target != column: 
        #numerical vs numerical: pearsons or spearmans
        if (df[target].dtype == 'float64' or df[target].dtype == 'int64') and (df[column].dtype == 'float64' or df[column].dtype == 'int64'):
            x, p = stats.pearsonr(df[target], df[column])       # can use either pearsonr or spearmanr depending on how the data is distributed
            print(f'Target: {target}')
            print(f'Column: {column}')
            print()
            
            print(f'x: {x}')
            print(f'p: {p}')
            
            print()
            
            if p < 0.05:
                print('Correlation')
                print('p value is below the significance threshold of 0.05')    
            else:
                print('No Correlation')
                print('Reason: p value is above the significance threshold of 0.05')
            
            print()
            print('---------------------------------------------------------')
            print()


Target: Units Sold
Column: Manufacturing Price

x: -0.029643972853024458
p: 0.4335823652264368

No Correlation
Reason: p value is above the significance threshold of 0.05

---------------------------------------------------------

Target: Units Sold
Column: Sale Price

x: -0.0650658134215006
p: 0.0853913408043615

No Correlation
Reason: p value is above the significance threshold of 0.05

---------------------------------------------------------

Target: Units Sold
Column: Gross Sales

x: 0.32722066254045745
p: 6.232960229375536e-19

Correlation
p value is below the significance threshold of 0.05

---------------------------------------------------------

Target: Units Sold
Column: Discounts

x: 0.2530479774656188
p: 1.0911815841023358e-11

Correlation
p value is below the significance threshold of 0.05

---------------------------------------------------------

Target: Units Sold
Column: Sales

x: 0.3269139725274527
p: 6.747469128497393e-19

Correlation
p value is below the significan

In [25]:
import pandas as pd
from scipy import stats

df = pd.read_excel('C:/Users/Ted/Downloads/FinancialSample.xlsx')
#df = pd.read_csv('C:/Users/Ted/Downloads/FinancialSample.csv')

target = 'Sales'

# for every column that is not the target column, check the dtype of that 
# column and the target column and see if they are both object
for column in df.columns:
    if target != column: 
        #object vs numerical: ANOVA
        #There can be no correlation between nominal vs numerical, only something similar 
        if (df[target].dtype == 'object' and (df[column].dtype == 'float64' or df[column].dtype == 'int64')) or ((df[target].dtype == 'float64' or df[target].dtype == 'int64') and df[column].dtype == 'object'):    
            
            print(f'Target: {target}')
            print(f'Column: {column}')  
            
            if df[target].dtype == 'object':
                grouped_data = df.groupby(target)[column].apply(list)
            else:
                grouped_data = df.groupby(column)[target].apply(list)

            # Convert the grouped data into a list of arrays suitable for ANOVA
            data_arrays = [group for group in grouped_data]

            # Perform ANOVA
            f, p = stats.f_oneway(*data_arrays)
            # Finding degrees of freedom for both numerator and demoninator, then using those to help find critical value
            dfn, dfd = (2-1), (len(df.index)-2)      # (2-1) because we are only comparing 2 groups
            critical_value = stats.f.ppf(1 - 0.05, dfn, dfd)
            
            print()
            print("f:", f)
            print(f"Critical Value: {critical_value}", )
            print("p:", p)
            print()
            
            if p < 0.05:
                if f > critical_value:
                    print('Correlation')
                    print()
                    print('p value is below the significance threshold of 0.05')
                    print('f value is greater than critical value')
                    print('the difference between observed and expected frequencies is too large to be attributed to chance')

                else: 
                    print('No Correlation')
                    print()
                    print('p value is above the significance threshold of 0.05')
                    print('f value is less than critical value')
                    print('the difference between observed and expected frequencies is too small and could be attributed to chance')
                    
            else:
                print('No Correlation')
                print()
                print('p value is above the significance threshold of 0.05')
            
            print()
            print('---------------------------------------------------------')
            print()    

Target: Sales
Column: Segment

f: 67.74518604971661
Critical Value: 3.8548156889312546
p: 2.0255068154252574e-48

Correlation

p value is below the significance threshold of 0.05
f value is greater than critical value
the difference between observed and expected frequencies is too large to be attributed to chance

---------------------------------------------------------

Target: Sales
Column: Country

f: 0.35559570011666586
Critical Value: 3.8548156889312546
p: 0.8401927154078508

No Correlation

p value is above the significance threshold of 0.05

---------------------------------------------------------

Target: Sales
Column: Product

f: 0.43821963821158566
Critical Value: 3.8548156889312546
p: 0.8219373484795555

No Correlation

p value is above the significance threshold of 0.05

---------------------------------------------------------

Target: Sales
Column: Discount Band

f: 2.781342799315414
Critical Value: 3.8548156889312546
p: 0.04019543188543751

No Correlation

p value is a

In [26]:
import pandas as pd
from scipy import stats

df = pd.read_excel('C:/Users/Ted/Downloads/FinancialSample.xlsx')
#df = pd.read_csv('C:/Users/Ted/Downloads/FinancialSample.csv')

target = 'Country'

# for every column that is not the target column, check the dtype of that 
# column and the target column and see if they are both object
for column in df.columns:
    if target != column:       
        #anything vs datetime: Time Series Analysis
        if (df[target].dtype == 'object' or df[target].dtype == 'float64' or df[target].dtype == 'int64') and df[column].dtype == 'datetime64[ns]':
        #or df[target].dtype == 'datetime64[ns]' and (df[column].dtype == 'object' or df[column].dtype == 'float64' or df[column].dtype == 'int64')
            
            print(f'Target: {target}')
            print(f'Column: {column}')  
            print('Anything vs datetime')
            # need to add analysis, maybe graph, summary statistics, crosstab, etc?
            
            

Target: Country
Column: Date
Anything vs datetime


In [27]:
print()
for column in df.columns:
    print(f'{df[column].dtype} - {column}')


object - Segment
object - Country
object - Product
object - Discount Band
float64 - Units Sold
int64 - Manufacturing Price
int64 - Sale Price
float64 - Gross Sales
float64 - Discounts
float64 - Sales
float64 - COGS
float64 - Profit
datetime64[ns] - Date
int64 - Month Number
object - Month Name
int64 - Year


In [28]:
#ct = pd.crosstab(df['Country'], df['Product'])
#print(ct)
#print()

#stat, p, dof, expected = stats.chi2_contingency(ct)
#critical_value = stats.chi2.ppf(1 - 0.05, dof)

#print(stat)
#print(critical_value)          # this is the value that chi squared must be below or above to determine if it is high or low
#print(p)


#tvar = df[target]           # target variable


#res = stats.spearmanr(x, y)
#print(res)



#segment_data = df['Segment']
#sales_data = df['Sales']
#gross_sales_data = df['Gross Sales']


#f_value, p_value = stats.f_oneway(segment_data, sales_data, gross_sales_data)
#f_value, p_value = stats.f_oneway(df['Segment'], df['Sales'], df['Gross Sales'])
#print("F-value:", f_value, "P-value:", p_value)


#import numpy as np
#import scipy.stats

#x = np.array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
#y = np.array([2, 1, 4, 5, 8, 12, 18, 25, 96, 48])

#scipy.stats.pearsonr(x, y)    # Pearson's r
#scipy.stats.spearmanr(x, y)   # Spearman's rho
#scipy.stats.kendalltau(x, y)  # Kendall's tau


#out = scipy.stats.spearmanr(x, y)
#print(out)






In [29]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Sample DataFrame with categorical features
data = {'X1': [1, 2, 3, 4, 5],
        'X2': [5, 4, 3, 2, 1],
        'X3': [2, 3, 4, 5, 6],
        'y': [1, 0, 1, 0, 1]}
df = pd.DataFrame(data)

# Separate features and target
X = df.drop('y', axis=1)
y = df['y']

# Apply SelectKBest with chi-squared test
k = 2  # Number of top features to select
chi2_selector = SelectKBest(chi2, k=k)
X_kbest = chi2_selector.fit_transform(X, y)

# Get selected feature names
selected_features = [X.columns[i] for i in chi2_selector.get_support(indices=True)]

# Results
print("Selected features based on chi-squared test:", selected_features)

Selected features based on chi-squared test: ['X2', 'X3']
