In [1]:
# Install PySpark, which provides the SparkSession used to load the CSV in later cells.
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install google

Defaulting to user installation because normal site-packages is not writeable


# Class KnowledgeRepresentation

In [3]:
class KnowledgeRepresentation:
  """Visual and textual summaries of a pandas DataFrame relative to a target column.

  Every plotting method groups the non-target columns the same way:

  * numeric     -- ``number`` dtypes
  * categorical -- ``object`` / ``category`` dtypes
  * boolean     -- any numeric or categorical column with exactly two
                   distinct non-null values

  Columns of any other dtype are not plotted.
  """

  def __init__(self):
    pass

  @staticmethod
  def _split_columns(df, target):
    """Group the non-target columns of ``df`` by how they are plotted.

    Args:
      df: pandas DataFrame to inspect.
      target: name of the target column; it is excluded from every group.

    Returns:
      Tuple ``(numeric_columns, categorical_columns, boolean_columns,
      target_is_numeric)``. ``boolean_columns`` lists two-valued categorical
      columns first, then two-valued numeric columns. ``target_is_numeric``
      is False for any target that is not of a ``number`` dtype.
    """
    numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    target_is_numeric = target in numeric_columns
    if target_is_numeric:
        numeric_columns.remove(target)
    elif target in categorical_columns:
        categorical_columns.remove(target)

    # Two-valued columns are shown as "boolean" regardless of their dtype
    two_valued = {col for col in categorical_columns + numeric_columns
                  if df[col].nunique() == 2}
    boolean_columns = ([col for col in categorical_columns if col in two_valued]
                       + [col for col in numeric_columns if col in two_valued])
    numeric_columns = [col for col in numeric_columns if col not in two_valued]
    categorical_columns = [col for col in categorical_columns if col not in two_valued]

    return numeric_columns, categorical_columns, boolean_columns, target_is_numeric

  @staticmethod
  def _grid_rows(num_plots, num_cols=2):
    """Return the number of grid rows needed to hold ``num_plots`` panels."""
    return -(-num_plots // num_cols)  # ceiling division

  @staticmethod
  def _draw_frequency_panel(ax, df, col, kind):
    """Draw one frequency panel for ``col`` on ``ax``.

    ``kind`` selects the chart: ``'numeric'`` draws a histogram with mean and
    mean +/- 3 std-dev markers, ``'categorical'`` a bar plot of value counts,
    and ``'boolean'`` / ``'target'`` a pie chart of value counts.
    """
    section_labels = {
        'numeric': 'Numeric Column',
        'categorical': 'Categorical Column',
        'boolean': 'Boolean Column',
        'target': 'Target Column',
    }

    if kind == 'numeric':
        data = df[col].dropna()
        ax.hist(data, bins=30, edgecolor='black')
        # Add lines for mean and standard deviation
        mean = data.mean()
        std_dev = data.std()
        ax.axvline(mean, color='r', linestyle='--', linewidth=2, label='Mean')
        ax.axvline(mean + 3 * std_dev, color='g', linestyle='--', linewidth=2, label='Mean + 3 Std Dev')
        ax.axvline(mean - 3 * std_dev, color='g', linestyle='--', linewidth=2, label='Mean - 3 Std Dev')
        ax.legend()
        ax.set_title(f"Histogram of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Frequency")
    elif kind == 'categorical':
        df[col].value_counts().plot(kind='bar', ax=ax, color='aquamarine', edgecolor='black')
        ax.set_title(f"Bar Plot of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Frequency")
    else:
        df[col].value_counts().plot.pie(autopct='%1.1f%%', startangle=90, ax=ax)
        ax.set_title(f"Pie Chart of {col}")
        ax.set_ylabel('')  # Hide the y-label

    # Section heading drawn above the panel
    ax.annotate(section_labels[kind], xy=(0.5, 1.2), xycoords='axes fraction',
                ha='center', va='center', fontsize=24, fontweight='bold')

  # Method to Generate Graphs of Frequencies of Entries in each Column
  def show_frequencies(self, df, target):
    """Plot the value frequencies of every column of ``df`` in a two-column grid.

    Numeric columns get histograms, categorical columns bar plots, two-valued
    columns pie charts; the target column is drawn last as a pie chart.

    Args:
      df: pandas DataFrame to plot.
      target: name of the target column.

    Raises:
      KeyError: if ``target`` is not a column of ``df``.
    """
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec

    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in DataFrame")

    numeric_columns, categorical_columns, boolean_columns, _ = self._split_columns(df, target)

    # One (kind, column) entry per panel, in display order; the target goes last
    panels = [('numeric', col) for col in numeric_columns]
    panels += [('categorical', col) for col in categorical_columns]
    panels += [('boolean', col) for col in boolean_columns]
    panels.append(('target', target))

    # Two plots per row; allocate exactly as many rows as the panels need
    num_rows = self._grid_rows(len(panels))

    # Create figure and GridSpec layout
    fig = plt.figure(figsize=(20, 8 * num_rows), dpi=100)
    gs = gridspec.GridSpec(num_rows, 2)

    for position, (kind, col) in enumerate(panels):
        ax = fig.add_subplot(gs[position // 2, position % 2])
        self._draw_frequency_panel(ax, df, col, kind)

    # Adjust layout and display
    fig.tight_layout()
    fig.subplots_adjust(hspace=0.8, wspace=0.2)
    font = {'family': 'sans-serif', 'color': 'black', 'weight': 'bold', 'size': 36}
    fig.text(0.5, 1.005, 'FREQUENCY GRAPHS', ha='center', va='center', fontdict=font)  # Centered title above all subplots
    plt.show()

  # Method to Generate Graphs of Distribution of Columns by Target
  def show_distribution(self, df, target):
    """Plot every non-target column of ``df`` against ``target`` in a two-column grid.

    With a numeric target, numeric columns are scatter-plotted and all other
    columns strip-plotted. Otherwise numeric columns are strip-plotted,
    categorical columns count-plotted with the target as hue, and two-valued
    columns drawn as stacked bars per target value.

    Args:
      df: pandas DataFrame to plot.
      target: name of the target column. A target that is not of a numeric
        dtype is handled as categorical.

    Raises:
      KeyError: if ``target`` is not a column of ``df``.
      ValueError: if there is no column left to plot against the target.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in DataFrame")

    numeric_columns, categorical_columns, boolean_columns, target_is_numeric = \
        self._split_columns(df, target)

    num_plots = len(numeric_columns) + len(categorical_columns) + len(boolean_columns)
    if num_plots == 0:
        raise ValueError("No numeric, categorical or boolean columns to plot against the target")
    num_rows = self._grid_rows(num_plots)  # two plots per row

    # squeeze=False keeps axs two-dimensional even when there is a single row
    fig, axs = plt.subplots(num_rows, 2, figsize=(18, 6 * num_rows), squeeze=False)
    axes = axs.flatten()

    # Function to add a title above each category of graphs
    def add_category_title(ax, title):
        ax.text(0.5, 1.1, title, fontsize=24, fontweight='bold', ha='center', transform=ax.transAxes)

    idx = 0

    for col in numeric_columns:
        ax = axes[idx]
        if target_is_numeric:
            sns.scatterplot(x=target, y=col, data=df, ax=ax)
            ax.set_title(f'Scatter Plot of {col} by {target}')
        else:
            sns.stripplot(x=target, y=col, data=df, ax=ax)
            ax.set_title(f'Strip Plot of {col} by {target}')
        ax.set_xlabel(target)
        ax.set_ylabel(col)
        add_category_title(ax, 'Numeric Column')
        idx += 1

    for col in categorical_columns:
        ax = axes[idx]
        if target_is_numeric:
            sns.stripplot(x=col, y=target, data=df, ax=ax)
            ax.set_title(f'Strip Plot of {target} by {col}')
        else:
            sns.countplot(x=col, hue=target, data=df, ax=ax)
            ax.set_title(f'Count Plot of {col} by {target}')
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)
        ax.set_xlabel(col)
        add_category_title(ax, 'Categorical Column')
        idx += 1

    for col in boolean_columns:
        ax = axes[idx]
        if target_is_numeric:
            sns.stripplot(x=col, y=target, data=df, ax=ax)
            ax.set_title(f'Strip Plot of {target} by {col}')
        else:
            counts = df.groupby([target, col]).size().unstack().fillna(0)
            counts.plot(kind='bar', stacked=True, ax=ax)
            ax.set_title(f'Stacked Bar Plot of {col} by {target}')
            ax.set_xlabel(target)
            ax.set_ylabel('Count')
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)
        add_category_title(ax, 'Boolean Column')
        idx += 1

    # Hide the unused last subplot if there is an odd number of plots
    if num_plots % 2 != 0:
        fig.delaxes(axes[-1])

    fig.tight_layout()
    fig.suptitle('Distribution of Columns by Target', fontsize=36, fontweight='bold', y=1.04)
    plt.show()

  # Method to Show Descriptive Statistics
  def show_describe_stats(self, df, target):
    """Print Gemini-generated prose for ``df.describe()`` and the target's class balance.

    Two prompts are sent to the ``gemini-1.5-pro`` model: one with the output
    of ``df.describe()`` and one with ``df[target].value_counts()``. Each
    response text is printed.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable so
    that no credential is stored in the notebook.
    NOTE(review): a key that was previously committed in this file should be
    revoked and rotated.

    Args:
      df: pandas DataFrame to describe.
      target: name of the target column.

    Raises:
      RuntimeError: if ``GOOGLE_API_KEY`` is not set.
    """
    import os

    # Fail fast, before importing the SDK or contacting the service
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the GOOGLE_API_KEY environment variable before calling show_describe_stats"
        )

    import google.generativeai as genai

    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name='gemini-1.5-pro'
    )

    desc = df.describe()

    prompt = f"""Please represent the following\
    statistics of a DataFrame in an elegant tableless descriptive\
    format with separate sections for each column\
    and without redundant sentences: {desc}. Everything must be\
    in plain text without markdowns."""

    response = model.generate_content(prompt)

    print(response.text)

    prompt = f'''{df[target].value_counts()} is count of\
    unique values in target column, comment on the skewness\
    of the dataset without using redundant sentences.'''

    response = model.generate_content(prompt)

    print(response.text)

In [4]:
from Data_Preprocessing import DataPreprocessing
from pyspark.sql import SparkSession

ModuleNotFoundError: No module named 'Data_Preprocessing'

In [4]:
# Create the project's preprocessing helper and get (or create) the Spark
# session named "Knowledge Representation" used to read the data.
dp = DataPreprocessing()
spark = SparkSession.builder.appName("Knowledge Representation").getOrCreate()

In [5]:
# Load the CSV through the project's DataPreprocessing helper.
# NOTE(review): "/content/adult.csv" is a hardcoded absolute path — consider
# reading it from a configurable data directory.
df = dp.readCsv("/content/adult.csv", spark)

In [6]:
# List the column names of the loaded DataFrame
df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'income']

In [7]:
# Drop the 'id' column if the dataset has one
if 'id' in df.columns:
  df = df.drop('id')

In [None]:
# Preview the loaded data before any missing values are filled
df.show()

In [9]:
# Split the columns by Spark type name: 'string' and 'boolean' columns are
# treated as categorical, every other type as numeric.
numeric_cols = []
categorical_cols = []

# df.dtypes already yields (column_name, type_name) pairs in column order, so
# no per-column select() is needed. The loop variables are also named so they
# do not shadow `col` from pyspark.sql.functions.
for col_name, col_type in df.dtypes:
  if col_type in ('string', 'boolean'):
    categorical_cols.append(col_name)
  else:
    numeric_cols.append(col_name)

In [10]:
# Fill missing values through the project's helper, passing the numeric and
# categorical column lists.
# NOTE(review): the fill strategy lives in DataPreprocessing.fillMissingValues
# and is not visible in this notebook.
df = dp.fillMissingValues(df, numeric_cols, categorical_cols)

In [None]:
# Preview the data again after missing values have been filled
df.show()

In [11]:
# Convert to a pandas DataFrame, which the KnowledgeRepresentation methods expect.
# NOTE(review): toPandas() presumably collects every row into local memory —
# confirm the dataset is small enough.
df_pandas = df.toPandas()

In [None]:
# Plot per-column frequency charts, with 'income' as the target column
kr = KnowledgeRepresentation()
kr.show_frequencies(df_pandas, 'income')

In [None]:
# Plot each column against the 'income' target
kr = KnowledgeRepresentation()
kr.show_distribution(df_pandas, 'income')

In [147]:
# Print the model-generated description of the summary statistics and of the
# 'income' class balance (this calls the external Gemini API)
kr = KnowledgeRepresentation()
kr.show_describe_stats(df_pandas, 'income')

Column: age
- Values range from 17 to 90 years old.
- The average age is 38.58 years old.
- 50% of the values are between 28 and 48 years old.

Column: fnlwgt
- Values range from 12285 to 1484705.
- The average value is 189778. 
- 50% of the values are between 117827 and 237051.

Column: education_num
- Values range from 1 to 16.
- The average value is 10.08.
- 50% of the values are between 9 and 12.

Column: capital_gain
- Values range from 0 to 99999.
- The average value is 1077.65.
- 75% of the values are 0.

Column: capital_loss
- Values range from 0 to 4356.
- The average value is 87.3.
- 75% of the values are 0.

Column: hours_per_week
- Values range from 1 to 99.
- The average value is 40.44.
- 50% of the values are between 40 and 45.

The dataset exhibits a significant class imbalance, with many more instances belonging to the "<=50K" class than the ">50K" class. 

