# Set Up

In [1]:
# Import libraries
import altair as alt
import pandas as pd
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import numpy as np
from numpy import genfromtxt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. analysis Boston weather's data and shows how Boston's tempurture changed over time. 

In [2]:
#Ingest data from csv
df = pd.read_csv('boston-weather-mid.csv')

## First interactive visualization
#### The user can set the year range to see how the temperature changed in different year range. (interaction)

In [3]:
# Convert month names to numbers
month_to_num = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
    'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

num_to_month = {v: k for k, v in month_to_num.items()}  # Reverse mapping

# return colors based on the time range
def assign_color_range(year):
    base_year = 1872  # Start of the time range
    interval = 20  # Every 20 years
    index = (year - base_year) // interval 
    colors = ['#EAE4E2', '#DACFCA', '#CCBBB6', '#BFADA4', '#D18F71', '#DF672D', '#DF501B', '#DE0E0B'] #color of each range
    return colors[index % len(colors)] # giving different color for every 20 years

# Get the min and max year
year_range = (df['year'].min(), df['year'].max())

# Convert month to mun
df['month_num'] = df['month'].map(month_to_num)

# Create a new column to represent the 20-year time period a year belongs to
df['year_period'] = df['year'].apply(lambda y: f"{(y//20)*20}s to {(y//20)*20 + 19}s")

# create an interactive control
@interact(
    yearRange=widgets.IntRangeSlider(
        value=year_range,
        min=year_range[0],
        max=year_range[1],
        step=1,
        description='Year Range:',
        continuous_update=False
    )
)

# create an interactive chart
def timeline_chart(yearRange):
    # filter the year range
    filtered_df = df[(df['year'] >= yearRange[0]) & (df['year'] <= yearRange[1])]

    # Convert num to month name for better labeling
    filtered_df_copy = filtered_df.copy()
    filtered_df_copy['month_name'] = filtered_df_copy['month_num'].map(num_to_month)

    # Specify years and colors in the legend
    legend_periods = [f"{y}s to {y + 19}s" for y in range(1880, 2022, 20)]
    legend_colors = ['#EAE4E2', '#DACFCA', '#CCBBB6', '#BFADA4', '#D18F71', '#DF672D', '#DF501B', '#DE0E0B']
    
    # create an altir chart
    chart = alt.Chart(
        filtered_df_copy,
        title=alt.TitleParams(
            'How the Boston temperatures for each month have changed over time',
            subtitle=f"A comparison of {yearRange[0]} to {yearRange[1]}"
        )
    ).mark_line().encode(
        x=alt.X('month_name:O', axis=alt.Axis(title='Month'), sort=list(num_to_month.values())),
        y=alt.Y('meanTemp:Q', scale=alt.Scale(domain=[0, 70], nice=False), axis=alt.Axis(title='Mean Monthly Temp in °F')),
        color=alt.Color('year_period:N', 
                        scale=alt.Scale(domain=legend_periods, range=legend_colors), 
                        legend=alt.Legend(title='Year', symbolType='circle', symbolStrokeWidth=4)),
        detail='year:N', 
        tooltip=[alt.Tooltip('year'), alt.Tooltip('month_name:N'), alt.Tooltip('meanTemp:Q')]
    ).properties(
        width=450,
        height=500 # size of the chart
    )
    
    return chart

interactive(children=(IntRangeSlider(value=(1872, 2023), continuous_update=False, description='Year Range:', m…

## Second interactive visualization.
#### The user could choose which month they want to analyze.
#### Using Altair’s brush interaction, the user should be able to select a range on the x-axis and have the mean of the temperatures for that month plotted as a horizontal line.

In [4]:
brush = alt.selection_interval(encodings=['x'])  # Brushing on x-axis

# create an interactive control
@interact(
    month=widgets.Dropdown(
        options=df['month'].unique(),  # unique months from the data
        value='Aug', #start at august
        description='Month:',
    )
)

# create an interactive chart
def plot_chart(month):
    # Filter data based on the chosen month
    filtered_df = df[df['month'] == month]
    boston_weather_chart = alt.Chart(filtered_df, width=600, height=300) # the size of chart

    # Points Chart
    points = boston_weather_chart.mark_point().encode(
        x=alt.X('year:O', 
                axis=alt.Axis(values=[1860, 1880, 1900, 1920, 1940, 1960, 1980, 2000, 2020, 2040]),       
                title='Year'),
        y=alt.Y('meanTemp:Q', title='Mean Monthly Temp in °F'),
        color=alt.Color('year:Q', 
                        scale=alt.Scale(domain=[filtered_df['year'].min(), 
                                                filtered_df['year'].max()], 
                                        range=['gray','orange', 'red']), # set the color
                        title='Year'),  
        size=alt.condition(brush, alt.value(50), alt.value(50)),
    tooltip=['year', 'meanTemp', 'month']
 ).add_params(brush)

    # Mean Line                                      # the color and size of  the mean line
    meanLine = alt.Chart(filtered_df).mark_rule(color='#B80107', opacity=.5).encode(
        y=alt.Y('mean(meanTemp):Q', scale=alt.Scale(zero=False)),
        size=alt.SizeValue(3)
    ).transform_filter(
        brush  # Use the brush to filter data
    )

    return points + meanLine

interactive(children=(Dropdown(description='Month:', index=7, options=('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun…

# 2. if a student gets a job right out of college. You can interpret the 7 dimensions as
1) Fitness level (between 1 to 10)
2) College GPA (between 0 - 4.0)
3) Height (in inches)
4) Having a large social network (between 1 to 10)
5) Being in a serious relationship (between 1 to 10)
6) Graduate in good economic condition (between 1 to 10 with 10 as amazing economy)
7) Number of internships
# By using these 7 factors to predict if a student will get a job right out of college.

In [5]:
#Ingest data from csv
X = genfromtxt('job_right_out_of_college_X.csv', delimiter=',')
y = genfromtxt('job_right_out_of_college_y.csv', delimiter=',')

In [6]:
# split to Train/Test/Validation
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

In [7]:
# preprocessing
scaler = MinMaxScaler(feature_range=(0, 1))

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))
X_val = np.hstack((np.ones((X_val.shape[0], 1)), X_val))

In [8]:
θ = np.ones((X_train.shape[1], 1))
def sigmoid(x, θ):
    return 1 / (1 + np.exp(-np.dot(x, θ)))

def dL(X, y, θ, λ):
    m = X.shape[0]
    predictions = sigmoid(X, θ)
    error = predictions - y.reshape(-1, 1) 
    reg_term = λ * np.sign(θ)
    return (1 / m) * np.dot(X.T, error) + reg_term

def gradient_descent(X, y, θ, yt, λ, iterations=5000):
    for i in range(iterations):
        θ -= yt * dL(X, y, θ, λ)
    return θ

λ=0
yt=0.2
θ = gradient_descent(X_train, y_train, θ, yt, λ)

predictions_train = sigmoid(X_train, θ)
predictions_test = sigmoid(X_test, θ)
predictions_val = sigmoid(X_val, θ)

y_train_pred = [1 if p > 0.5 else 0 for p in predictions_train]
y_test_pred = [1 if p > 0.5 else 0 for p in predictions_test]
y_val_pred = [1 if p > 0.5 else 0 for p in predictions_val]

accurancy_test = accuracy_score(y_test, y_test_pred)
accurancy_val = accuracy_score(y_val, y_val_pred)
train_predictions = sigmoid(X_train, θ)
train_predictions = [1 if p > 0.5 else 0 for p in train_predictions]
accuracy = np.mean(train_predictions == y_train)
print(f"Accuracy on training data: {accuracy:.2f}")
print(f"Accuracy on test data: {accurancy_test:.2f}")
print(f"Accuracy on val data: {accurancy_val:.2f}")

Accuracy on training data: 0.98
Accuracy on test data: 0.98
Accuracy on val data: 0.98


In [9]:
# Assuming 'weights' is your model's weight array and 'feature_names' is a list of the feature names
feature_names = ['Fitness level', 'College GPA', 'Height', 'Social network', 
                 'Serious relationship', 'Economic condition', 'Number of internships']

# Assuming θ is your weight vector and is a numpy array
feature_importances = [(name, weight) for name, weight in zip(feature_names, θ.flatten())]
feature_importances.sort(key=lambda x: abs(x[1]), reverse=True)

# Sorted features by importance
for feature, importance in feature_importances:
    print(f"{feature}: {importance}")

# Identifying positively influencing factors
positive_influences = [feature for feature, weight in feature_importances if weight > 0]
print("Factors most positively influencing getting a job:", positive_influences)

Fitness level: -11.546900190332169
Number of internships: 9.40757408670768
Serious relationship: 4.305858917038395
Height: 2.793912793493618
Economic condition: -2.1664957071819515
Social network: -0.6282233786609756
College GPA: 0.15242512023482208
Factors most positively influencing getting a job: ['Number of internships', 'Serious relationship', 'Height', 'College GPA']


### Which factors most positively influence getting a job?

according to the features list that I got, the most positively influence one is Number of internships. cause the influence is the highest one.

In [10]:
# Using LogisticRegression with L1 penalty (lasso) and a liblinear solver which supports L1 penalties.
logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=42)
logreg_l1.fit(X_train, y_train)

y_pred_sklearn_test = logreg_l1.predict(X_test)
y_pred_sklearn_train = logreg_l1.predict(X_train)
y_pred_sklearn_val = logreg_l1.predict(X_val)
accuracy_sklearn_test = accuracy_score(y_test, y_pred_sklearn_test)
accuracy_sklearn_train = accuracy_score(y_train, y_pred_sklearn_train)
accuracy_sklearn_val = accuracy_score(y_val, y_pred_sklearn_val)

print("the accuracy of test data is", accuracy_sklearn_test)
print("the accuracy of train data is", accuracy_sklearn_train)
print("the accuracy of validation data is", accuracy_sklearn_val)

the accuracy of test data is 1.0
the accuracy of train data is 0.995
the accuracy of validation data is 0.99
