In [1]:
import logging

import pandas as pd

def find_overlapping_columns(*dataframes):
    """
    Return the column labels shared by every DataFrame passed in.

    The result follows the column order of the first DataFrame, so callers
    that slice with it (e.g. ``df[columns]``) get the same layout on every
    run. Returning ``list(some_set)`` would leave the order up to set
    iteration, which is not stable for string labels across processes.

    Parameters:
    *dataframes (pd.DataFrame): Two or more DataFrames to compare.

    Returns:
    list: Labels present in all DataFrames, each listed once, in the order
        they first appear in ``dataframes[0].columns``.

    Raises:
    ValueError: If fewer than two DataFrames are supplied.
    """
    # Ensure that at least two DataFrames are provided
    if len(dataframes) < 2:
        raise ValueError("At least two DataFrames are required for finding overlaps.")

    first, *others = dataframes
    # Sets give O(1) membership checks for every remaining frame
    other_column_sets = [set(df.columns) for df in others]

    # dict.fromkeys de-duplicates while keeping first-seen order
    candidates = dict.fromkeys(first.columns)
    return [
        label for label in candidates
        if all(label in columns for columns in other_column_sets)
    ]

def restrict_to_overlapping_columns(*dataframes):
    """
    Cut every DataFrame down to the columns that all of them share.

    Parameters:
    *dataframes (pd.DataFrame): The DataFrames to align; passed straight to
        find_overlapping_columns, which decides what counts as shared.

    Returns:
    list: One column-restricted DataFrame per input, in the input order.
    """
    shared_columns = find_overlapping_columns(*dataframes)
    # The shared labels are echoed to stdout before slicing
    print(shared_columns)
    return [frame[shared_columns] for frame in dataframes]

# Example usage:
# Create three sample DataFrames with different columns.
# df1 and df2 both hold A and B (in opposite order); df3 holds A and C,
# so 'A' is the only column present in all three.
df1 = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [10, 20, 30, 40]})
df2 = pd.DataFrame({'B': [3, 4, 5, 6], 'A': [300, 400, 500, 600]})
df3 = pd.DataFrame({'A': [5, 6, 7, 8], 'C': [700, 800, 900, 1000]})

# Create a list of DataFrames
dataframes = [df1, df2, df3]

# Restrict DataFrames to overlapping columns (the list is unpacked into positional arguments)
restricted_dataframes = restrict_to_overlapping_columns(*dataframes)

# Print restricted DataFrames, numbering them from 1
for i, df in enumerate(restricted_dataframes):
    print(f"DataFrame {i+1} (Restricted to Overlapping Columns):\n{df}\n")


['A']
DataFrame 1 (Restricted to Overlapping Columns):
   A
0  1
1  2
2  3
3  4

DataFrame 2 (Restricted to Overlapping Columns):
     A
0  300
1  400
2  500
3  600

DataFrame 3 (Restricted to Overlapping Columns):
   A
0  5
1  6
2  7
3  8



In [3]:
def temp():
    """Return the notebook-level frames df1, df2 and df3 as a list."""
    return [df1, df2, df3]

# A returned list can be unpacked directly into separate names.
[a,b,c] = temp()

# Display the second unpacked element, i.e. the frame that df2 refers to.
b


Unnamed: 0,B,A
0,3,300
1,4,400
2,5,500
3,6,600


In [4]:
def convert_values(input_value, value1_list, value2_list):
    """
    Translate a value, or every element of a list, from value1_list to value2_list.

    A value found in value1_list is replaced by the entry of value2_list at
    the same position (the first matching position when value1_list has
    repeats). A value that is not in value1_list is returned unchanged.

    Parameters:
    input_value: A single value, or a list whose elements are translated one by one.
    value1_list (list): Values to look for.
    value2_list (list): Replacements, aligned by position with value1_list.

    Returns:
    A list of translated elements when input_value is a list, otherwise the
    single translated value.
    """
    def _translate(item):
        # Unknown items pass through untouched
        if item not in value1_list:
            return item
        return value2_list[value1_list.index(item)]

    if not isinstance(input_value, list):
        return _translate(input_value)
    return [_translate(element) for element in input_value]

# Example usage:
# The two lists are aligned by position: 'apple' -> 'red', 'banana' -> 'yellow', 'cherry' -> 'red'.
value1_list = ['apple', 'banana', 'cherry']
value2_list = ['red', 'yellow', 'red']

# Convert a single value
converted_value = convert_values('banana', value1_list, value2_list)
print(f"Converted Value: {converted_value}")

# Convert a list of values; 'kiwi' is not in value1_list, so it comes back unchanged
input_list = ['apple', 'banana', 'kiwi']
converted_list = convert_values(input_list, value1_list, value2_list)
print(f"Converted List: {converted_list}")


Converted Value: yellow
Converted List: ['red', 'yellow', 'kiwi']


In [6]:
def convert_values(input_value, from_list, to_list):
    """
    Translate a value, or every element of a list, through a from -> to lookup.

    The lookup is built by pairing from_list with to_list position by
    position. Pairing stops at the shorter of the two, and when from_list
    repeats a value the last pairing wins. Inputs without an entry in the
    lookup are returned unchanged.

    Parameters:
    input_value: A single value, or a list whose elements are translated one by one.
    from_list: Values to translate from.
    to_list: Values to translate to, aligned by position with from_list.

    Returns:
    A list of translated elements when input_value is a list, otherwise the
    single translated value.

    Example:
    >>> convert_values('banana', ['apple', 'banana', 'cherry'], ['red', 'yellow', 'red'])
    'yellow'
    >>> convert_values(['apple', 'banana', 'kiwi'], ['apple', 'banana', 'cherry'], ['red', 'yellow', 'red'])
    ['red', 'yellow', 'kiwi']
    """
    lookup = dict(zip(from_list, to_list))

    def _translate(item):
        # Fall back to the item itself when it has no mapping
        return lookup.get(item, item)

    if not isinstance(input_value, list):
        return _translate(input_value)
    return [_translate(element) for element in input_value]
    
# Example usage (same as before): re-runs the earlier inputs against the
# dictionary-based convert_values defined in this cell.
value1_list = ['apple', 'banana', 'cherry']
value2_list = ['red', 'yellow', 'red']

# Convert a single value
converted_value = convert_values('banana', value1_list, value2_list)
print(f"Converted Value: {converted_value}")

# Convert a list of values; 'kiwi' has no mapping, so it comes back unchanged
input_list = ['apple', 'banana', 'kiwi']
converted_list = convert_values(input_list, value1_list, value2_list)
print(f"Converted List: {converted_list}")

Converted Value: yellow
Converted List: ['red', 'yellow', 'kiwi']


In [19]:
import numpy as np
def convert_with_tracking(input_value, from_list, to_list):
    """
    Map value(s) from from_list to the matching entries of to_list, logging
    which inputs could not be mapped.

    An input with no entry in from_list is replaced by np.nan in the result
    (it is not passed through unchanged). The unmapped inputs themselves are
    only reported through the DEBUG log, not returned.

    Parameters:
    input_value: A single value, or a list of values, to convert.
    from_list (list): Source values; position i maps to to_list[i].
    to_list (list): Target values; must have the same length as from_list.

    Returns:
    list: One entry per input (a single value yields a one-element list):
        the mapped value, or np.nan where no mapping exists.

    Raises:
    ValueError: If from_list and to_list differ in length.

    Example:
    >>> convert_with_tracking('kiwi', ['apple', 'banana'], ['red', 'yellow'])
    [nan]
    >>> convert_with_tracking(['apple', 'kiwi'], ['apple', 'banana'], ['red', 'yellow'])
    ['red', nan]
    """

    logging.debug("Ensure from_list and to_list have the same length")
    if len(from_list) != len(to_list):
        raise ValueError("Input lists must have the same length.")

    logging.info("Converting input by creating a dictionary to map values from 'from_list' to 'to_list'")
    value_mapping = dict(zip(from_list, to_list))

    logging.debug("Initialize lists to track converted and unconverted items")
    converted_items = []
    unconverted_items = []

    # Normalise to a list so a single value and a list share one conversion loop
    if isinstance(input_value, list):
        logging.debug("If input_value is a list, convert each element")
        items = input_value
    else:
        logging.debug("If input_value is a single value, convert it")
        items = [input_value]

    # A private sentinel distinguishes "no mapping" from a mapping whose
    # target value is legitimately None.
    missing = object()
    for item in items:
        converted_value = value_mapping.get(item, missing)
        if converted_value is missing:
            converted_items.append(np.nan)
            unconverted_items.append(item)
        else:
            converted_items.append(converted_value)

    # Lazy %-formatting: the values are actually interpolated, and only when
    # DEBUG logging is enabled.
    logging.debug("%d converted: %s", len(converted_items), converted_items)
    logging.debug("%d unconverted: %s", len(unconverted_items), unconverted_items)
    return converted_items

# Example usage:
value1_list = ['apple', 'banana', 'cherry']
value2_list = ['red', 'yellow', 'red']

# Convert a single value; the result is still a list, and 'kiwi' has no
# entry in value1_list, so it is reported as nan
conversion_result = convert_with_tracking('kiwi', value1_list, value2_list)
print(conversion_result)
# Convert a list of values; only the unmapped 'kiwi' becomes nan
input_list = ['apple', 'banana', 'kiwi']
conversion_result = convert_with_tracking(input_list, value1_list, value2_list)
print(conversion_result)


[nan]
['red', 'yellow', nan]


In [22]:
import pandas as pd
import numpy as np
def drop_na_index_rows(df):
    """
    Return the rows of a DataFrame whose index label is not missing.

    Parameters:
    df (pd.DataFrame): Frame to filter. It is not modified.

    Returns:
    pd.DataFrame: The rows of df whose index label is not flagged by
        Index.isna(), in their original order.
    """
    # True for every row whose label is present
    has_label = ~df.index.isna()
    return df[has_label]

# Example usage:
# Two of the five index labels are missing: one np.nan and one None.
data = {'A': [1, 2, 3, 4, 5]}
index_values = [np.nan, 'row2', 'row3', None, 'row5']
df = pd.DataFrame(data, index=index_values)

# Drop rows with missing index values from the DataFrame
cleaned_df = drop_na_index_rows(df)

# df itself is printed unchanged; the filtered rows live in cleaned_df
print("Original DataFrame:")
print(df)

print("\nDataFrame after dropping rows with missing index values:")
print(cleaned_df)


Original DataFrame:
      A
NaN   1
row2  2
row3  3
None  4
row5  5

DataFrame after dropping rows with missing index values:
      A
row2  2
row3  3
row5  5


In [26]:
# Bracket-style column access: value_counts() on the selected column yields a Series.
type(df3['A'].value_counts())

pandas.core.series.Series

In [27]:
# Attribute-style access (df3.A) gives the same result type as df3['A'].
type(df3.A.value_counts())

pandas.core.series.Series

In [29]:
# Make 'A' the index, then pick 'C' with a list selector so the result stays a one-column DataFrame.
df3.set_index('A').loc[:, ['C']]

Unnamed: 0_level_0,C
A,Unnamed: 1_level_1
5,700
6,800
7,900
8,1000


In [40]:
# reset_index() turns the current index into a column before 'A' becomes the
# new index; the list selector [['C']] keeps a one-column DataFrame.
df3.reset_index().set_index('A')[['C']]

Unnamed: 0_level_0,C
A,Unnamed: 1_level_1
5,700
6,800
7,900
8,1000


In [37]:
# A single label ['C'] (rather than the list [['C']]) returns a Series indexed by 'A'.
df3.reset_index().set_index('A')['C']

A
5     700
6     800
7     900
8    1000
Name: C, dtype: int64

In [66]:
def describe_df(*datasets, n=5):
    """
    Debugging aid: print a compact summary of each DataFrame passed in.

    Parameters:
    *datasets (pd.DataFrame): DataFrames to summarise.
    n (int): How many leading column labels and index labels to show.

    Returns:
    None
    """
    for frame in datasets:
        # Slices of the Index objects are printed as-is (their repr), not as plain lists
        leading_columns = frame.columns[:n]
        leading_labels = frame.index[:n]
        print(f"-----\nShape {frame.shape} \nFirst {n} cols: {leading_columns} \nFirst {n} indices: {leading_labels} \n-----")

In [67]:
# NOTE(review): the recorded output shows columns X/Y and labels row4..row6, i.e. the df2
# built in the report_df_info example cells, not the first cell's df2 - depends on execution order.
describe_df(df2)

-----
Shape (3, 2) 
First 5 cols: Index(['X', 'Y'], dtype='object') 
First 5 indices: Index(['row4', 'row5', 'row6'], dtype='object') 
-----


In [61]:
import pandas as pd

def report_df_info(*dataframes, n=5):
    """
    Print a short summary of each DataFrame, numbered in argument order.

    Parameters:
    *dataframes (pd.DataFrame): DataFrames to summarise.
    n (int): How many leading column labels and index labels to list.

    Returns:
    None
    """
    for position, frame in enumerate(dataframes, start=1):
        column_preview = frame.columns[:n].tolist()
        index_preview = frame.index[:n].tolist()
        summary_lines = (
            f"----- DataFrame {position} Info -----",
            f"Shape: {frame.shape}",
            f"First {n} columns: {column_preview}",
            f"First {n} indices: {index_preview}",
            "-----",
        )
        # One line per entry, exactly as separate print calls would emit them
        print(*summary_lines, sep="\n")

# Example usage:
data1 = {'A': [1, 2, 3], 'B': [4, 5, 6]}
data2 = {'X': [7, 8, 9], 'Y': [10, 11, 12]}

# NOTE: this rebinds df1 and df2, replacing the frames created in the first cell;
# later cells that use df1/df2 see these versions.
df1 = pd.DataFrame(data1, index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame(data2, index=['row4', 'row5', 'row6'])

# Call the function to report information about the dataframes
report_df_info(df1, df2)


----- DataFrame 1 Info -----
Shape: (3, 2)
First 5 columns: ['A', 'B']
First 5 indices: ['row1', 'row2', 'row3']
-----
----- DataFrame 2 Info -----
Shape: (3, 2)
First 5 columns: ['X', 'Y']
First 5 indices: ['row4', 'row5', 'row6']
-----


In [73]:
import pandas as pd

def report_df_info_with_names(**dataframes):
    """
    Print a summary of each DataFrame, labelled with its keyword-argument name.

    Unlike the previewing variants, this lists every column label and every
    index label in full.

    Parameters:
    **dataframes (pd.DataFrame): DataFrames passed as name=frame pairs; the
        keyword is used as the label in the printed header.

    Returns:
    None
    """
    for label in dataframes:
        frame = dataframes[label]
        print(f"----- DataFrame: {label} Info -----")
        print(f"Shape: {frame.shape}")
        print(f"Columns: {frame.columns.tolist()}")
        print(f"Indices: {frame.index.tolist()}")
        print("-----")

# Example usage:
data1 = {'A': [1, 2, 3], 'B': [4, 5, 6]}
data2 = {'X': [7, 8, 9], 'Y': [10, 11, 12]}

df1 = pd.DataFrame(data1, index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame(data2, index=['row4', 'row5', 'row6'])

# Call the function with dataframe names using kwargs; the keyword itself
# ("df1", "DF2") becomes the label in the printed header
report_df_info_with_names(df1=df1, DF2=df2)


----- DataFrame: df1 Info -----
Shape: (3, 2)
Columns: ['A', 'B']
Indices: ['row1', 'row2', 'row3']
-----
----- DataFrame: DF2 Info -----
Shape: (3, 2)
Columns: ['X', 'Y']
Indices: ['row4', 'row5', 'row6']
-----


In [81]:
import pandas as pd

def report_df_info_with_names(df_dict, n=5):
    """
    Print a short summary of each DataFrame in a name -> DataFrame mapping.

    Parameters:
    df_dict (dict): Maps a display name to the DataFrame it labels.
    n (int): How many leading column labels and index labels to list.

    Returns:
    None
    """
    for label, frame in df_dict.items():
        header = f"----- DataFrame: {label} Info -----"
        print(header)
        print(f"Shape: {frame.shape}")
        # Only the first n labels of each axis are listed
        leading_columns = frame.columns[:n]
        print(f"First {n} columns: {leading_columns.tolist()}")
        leading_labels = frame.index[:n]
        print(f"First {n} indices: {leading_labels.tolist()}")
        print("-----")

# Example usage:
data1 = {'A': [1, 2, 3], 'B': [4, 5, 6]}
data2 = {'X': [7, 8, 9], 'Y': [10, 11, 12]}

df1 = pd.DataFrame(data1, index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame(data2, index=['row4', 'row5', 'row6'])

# Create a dictionary with dataframe names
dataframes_dict = {"DataFrame 1": df1, "DataFrame 2": df2}

# Call the function with the dictionary of dataframes
report_df_info_with_names(dataframes_dict)

# Alternatively, pair a list of names with a list of frames and build the dict with zip.
# NOTE(review): the recorded output has a "testing my setup" line between the two
# reports that nothing in this cell prints - the saved output looks stale.
names = ['Dataframe 1', 'DF2']
dfs = [df1, df2]
report_df_info_with_names(dict(zip(names, dfs)))

----- DataFrame: DataFrame 1 Info -----
Shape: (3, 2)
First 5 columns: ['A', 'B']
First 5 indices: ['row1', 'row2', 'row3']
-----
----- DataFrame: DataFrame 2 Info -----
Shape: (3, 2)
First 5 columns: ['X', 'Y']
First 5 indices: ['row4', 'row5', 'row6']
-----
testing my setup
----- DataFrame: Dataframe 1 Info -----
Shape: (3, 2)
First 5 columns: ['A', 'B']
First 5 indices: ['row1', 'row2', 'row3']
-----
----- DataFrame: DF2 Info -----
Shape: (3, 2)
First 5 columns: ['X', 'Y']
First 5 indices: ['row4', 'row5', 'row6']
-----


In [76]:
# Show the (name, frame) pairs that zip produces.
# NOTE(review): the recorded output shows names 'test' and 'b', which do not match the
# `names` list assigned in the In [81] cell - this output comes from stale kernel state.
list(zip(names, dfs))

[('test',
        A  B
  row1  1  4
  row2  2  5
  row3  3  6),
 ('b',
        X   Y
  row4  7  10
  row5  8  11
  row6  9  12)]

In [92]:
import pandas as pd
import logging

# Configure the logging module: basicConfig() with no arguments attaches the
# default handler to the root logger (it does nothing if one is already attached).
logging.basicConfig(
)
# getLogger() without a name returns the root logger; INFO lets the
# logging.info() calls below through and drops DEBUG messages.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def report_df_info_with_names(*args, n=5):
    """
    Build a text summary of each DataFrame instead of printing it.

    Names are not supplied by the caller: each frame is labelled
    "DataFrame <position>", counting from 1 in argument order.

    Parameters:
    *args (pd.DataFrame): Arbitrary number of dataframes.
    n (int): How many leading column labels and index labels to list.

    Returns:
    str: The summary lines of all frames joined with newlines (an empty
        string when no frames are given).
    """
    summary_lines = []
    for position, frame in enumerate(args, start=1):
        label = f"DataFrame {position}"
        summary_lines.extend([
            f"----- DataFrame: {label} Info -----",
            f"Shape: {frame.shape}",
            f"First {n} columns: {frame.columns[:n].tolist()}",
            f"First {n} indices: {frame.index[:n].tolist()}",
            "-----",
        ])

    # A single string is convenient to hand to logging.info() in one call
    return '\n'.join(summary_lines)

# Example usage:
data1 = {'A': [1, 2, 3], 'B': [4, 5, 6]}
data2 = {'X': [7, 8, 9], 'Y': [10, 11, 12]}

df1 = pd.DataFrame(data1, index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame(data2, index=['row4', 'row5', 'row6'])

# Log the information using logging.info(); the whole multi-line report goes
# out as one log record, so only its first line carries the "INFO:root:" prefix
logging.info(report_df_info_with_names(df1, df2))
logging.info("print")


INFO:root:----- DataFrame: DataFrame 1 Info -----
Shape: (3, 2)
First 5 columns: ['A', 'B']
First 5 indices: ['row1', 'row2', 'row3']
-----
----- DataFrame: DataFrame 2 Info -----
Shape: (3, 2)
First 5 columns: ['X', 'Y']
First 5 indices: ['row4', 'row5', 'row6']
-----
INFO:root:print
