# This notebook is to Implement my solution for the home assignment
* Written by: Yotam Dery
* Submission date: 05/16/2024

## Imports

In [1]:
import numpy as np
import pandas as pd
import warnings
from statsmodels.tsa.seasonal import seasonal_decompose
from utils import aggregate_duplicates
from plot_utils import plot_feature_over_time, plot_combined_trends, plot_corr, box_plot, plot_seasonal_decomposition, plot_lag_plots

warnings.filterwarnings('ignore')

## EDA

### Reading the data file

In [2]:
# Reading and showing glimps of the data
df = pd.read_csv('lightricks_interview_dataset.csv', parse_dates=['month'])
df.tail(15)

Unnamed: 0,month,spend,revenue,subs
45,2024-01-01,78245.433186,228361.488025,42807.797696
46,2024-01-01,79194.84935,228361.488025,42807.797696
47,2024-02-01,72942.356713,,
48,2024-02-01,68365.845848,,
49,2024-03-01,84123.965592,,
50,2024-03-01,76110.29945,,
51,2024-04-01,78180.634086,,
52,2024-05-01,86814.089306,,
53,2024-06-01,87459.192145,,
54,2024-07-01,86602.082479,,


In [3]:
# As I noticed that there are rows which are duplicated by their month value - we handle it in this function
aggregated_df = aggregate_duplicates(df)

In [4]:
# Check for missing values
print(aggregated_df.isnull().sum())
print("\nrevenue and the number of subscribers is not known ahead of time. Hence, These null values are O.K.")

month       0
spend       0
revenue    11
subs       11
dtype: int64

revenue and the number of subscribers is not known ahead of time. Hence, These null values are O.K.


In [5]:
aggregated_df.describe()

Unnamed: 0,spend,revenue,subs
count,45.0,34.0,34.0
mean,76008.017483,189175.002446,43183.683014
std,15068.365802,58885.541838,11815.3839
min,53974.044782,102986.770996,21775.164505
25%,68682.142098,141739.270826,33865.645563
50%,73281.546183,177698.244333,43103.881024
75%,80117.132521,237638.505176,52467.379499
max,138713.913887,304535.260688,65055.14081


In [6]:
aggregated_df.dtypes

month      datetime64[ns]
spend             float64
revenue           float64
subs              float64
dtype: object

### Plot time series plots

In [7]:
for feature in aggregated_df.select_dtypes(include='number'):
    plot_feature_over_time(aggregated_df, feature)

In [8]:
# Create traces
plot_combined_trends(aggregated_df)

### Correlation Analysis

In [9]:
correlation_matrix = np.round(aggregated_df.corr(),2)
plot_corr(correlation_matrix)
print("As observed from the time series plots and confirmed by the correlation analysis,\nthe features revenue and subscriptions are highly correlated.")

As observed from the time series plots and confirmed by the correlation analysis,
the features revenue and subscriptions are highly correlated.


### Box Plot

In [10]:
for feature in aggregated_df.select_dtypes(include='number'):
    box_plot(aggregated_df, feature)
print("As we can see, there are no outliers according to the box plot.\nThe boxes are fairly even")

As we can see, there are no outliers according to the box plot.
The boxes are fairly even


### Seasonal Decomposition

* Seasonal decomposition is a technique used in time series analysis to decompose a time series into several components, each representing an underlying pattern.<br> It helps in understanding and analyzing the structure of the time series data by separating it into more interpretable elements.<br> The main components in seasonal decomposition are:<br>
1. Trend Component:<br> This captures the long-term progression of the time series data. It represents the overall direction (upward or downward) in the data over a long period.<br><br>

2. Seasonal Component:<br> This captures the repeating short-term cycle in the data. For example, monthly sales data may have a seasonal component that reflects higher sales in certain months each year. <br><br>

3. Residual (or Irregular) Component:<br> This captures the random noise or irregular fluctuations in the data that cannot be explained by the trend or seasonal components. It's essentially the remaining part of the time series after removing the trend and seasonal effects.<br><br>

When applying this method, we'll use an Additive Model, Assuming that the time series is composed of the sum of its components: <br>
`Y(t)=T(t)+S(t)+R(t)`


In [11]:
# Set 'month' as index
aggregated_df.dropna(inplace=True)
aggregated_df.set_index('month', inplace=True)

In [12]:
for feature in aggregated_df.select_dtypes(include='number'):
    # Perform seasonal decomposition
    result = seasonal_decompose(aggregated_df[feature], model='additive', period=12)
    plot_seasonal_decomposition(result, aggregated_df, feature)

### Lag plots

In [13]:
for feature in df.select_dtypes(include='number'):
    plot_lag_plots(aggregated_df, feature)


In [21]:
import pandas as pd
import plotly.graph_objs as go
# Define the window size for rolling statistics
window_size = 12

df = aggregated_df

# Calculate rolling mean and standard deviation
rolling_stats = pd.DataFrame()
for feature in df.select_dtypes(include='number'):
    rolling_stats[f'{feature}_mean'] = df[feature].rolling(window=window_size).mean()
    rolling_stats[f'{feature}_std'] = df[feature].rolling(window=window_size).std()

# Plotting rolling statistics for each feature using Plotly
for feature in ['revenue', 'subs', 'spend']:
    fig = go.Figure()
    
    # Add rolling mean to the plot
    fig.add_trace(go.Scatter(x=rolling_stats.index, y=rolling_stats[f'{feature}_mean'],
                             mode='lines', name=f'Rolling Mean - {feature.capitalize()}'))
    
    # Add rolling standard deviation to the plot
    fig.add_trace(go.Scatter(x=rolling_stats.index, y=rolling_stats[f'{feature}_std'],
                             mode='lines', name=f'Rolling Std Dev - {feature.capitalize()}'))
    
    # Update layout
    fig.update_layout(title=f'Rolling Mean and Standard Deviation of {feature.capitalize()}',
                      xaxis_title='Month',
                      yaxis_title=feature.capitalize(),
                      height= 400,
                      width= 1400, 
                      legend=dict(x=0.01, y=0.99, bgcolor='rgba(255, 255, 255, 0)', bordercolor='rgba(255, 255, 255, 0)'))
    
    # Show plot
    fig.show()