In [1]:
%pip install factor_analyzer

In [1]:
# Import required libraries
import pandas as pd

from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt

In [2]:
import sys
# Make the shared helper modules in the reusable_code folder importable.
# NOTE(review): this absolute path is specific to the notebook host.
sys.path.append(r'/home/jupyter/reusable_code')

import google_api_functions as gaf

from google.cloud import bigquery

# Project-local helper; presumably loads Google credentials from the given
# folder (its result is passed as `credentials` below) — TODO confirm.
creds=gaf.Authenticate_Google(r'/home/jupyter/reusable_code/')

# Create a BigQuery client for the itv-bde-svod-prd project.
bq = bigquery.Client(project='itv-bde-svod-prd',credentials=creds)
# The query returns one row per (itv_ID, programme_title) pair with Count = the
# number of svod_stream_summary rows for that pair in the partitions from
# 2020-03-01 to 2020-03-31 (both columns non-null). It is restricted to itv_IDs
# that have at least 5 distinct programme titles in the same window.
query="""
select distinct 
itv_ID,
programme_title, 
count(*) as Count
from `itv-bde-svod-prd.pes.svod_stream_summary`
where _PARTITIONTIME >= '2020-03-01' and _PARTITIONTIME <= '2020-03-31' and itv_ID is not null and programme_title is not null
and 
itv_ID in 
(select ITV_ID from (select distinct ITV_ID, count(distinct programme_title) as Count_2 
from `itv-bde-svod-prd.pes.svod_stream_summary`
where _PARTITIONTIME >= '2020-03-01' and _PARTITIONTIME <= '2020-03-31' and itv_ID is not null and programme_title is not null 
group by 1
having count_2 >=5))
group by 1,2
order by 1,2
"""
# Run the query and load the complete result set into a pandas DataFrame.
df = bq.query(query).to_dataframe()
df

In [139]:
# NOTE(review): interactive lookup of the helper module's documentation;
# none of the visible cells use its output.
help(gaf)

In [3]:
# Reshape from long to wide: one row per itv_ID, one column per programme_title,
# cell values taken from Count.
df.columns.name = None
df_t = df.pivot(
    index='itv_ID',
    columns='programme_title',
    values='Count',
)



In [4]:
# Clear the name on the column axis so it is not shown as a header label.
df_t2 = df_t.rename_axis(None, axis='columns')
df_t2.head()

In [5]:
# Move itv_ID out of the index into an ordinary column. df_t2 already has its
# column-axis name cleared, so no second rename_axis call is needed here.
df_t3 = df_t2.reset_index()
df_t3.head()

In [6]:
# Drop the itv_ID column so that only the per-programme columns remain.
df_t4 = df_t3.drop('itv_ID', axis='columns')
df_t4.head()

In [7]:
import numpy as np

# Replace every NaN in the wide table with 0.
df_t5 = df_t4.replace(to_replace=np.nan, value=0)
df_t5.head()


In [23]:

# Pairwise Pearson correlation between the columns of df_t5.
corrm = df_t5.corr(method='pearson')
corrm


In [24]:
# Eigenvalues of the correlation matrix. corrm is symmetric, so eigvalsh is
# used: it always returns real values in ascending order. They are reversed so
# the largest eigenvalue comes first, which the cumulative sum below relies on.
eigen_values = np.linalg.eigvalsh(corrm)[::-1]
# Cumulative share of total variance. For a correlation matrix the total
# variance equals the number of variables, so it is taken from the data instead
# of being hard-coded (previously the literal 376).
eigen_values_cumvar = (eigen_values / len(eigen_values)).cumsum()
eigen_values_cumvar

In [8]:
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity

# Bartlett's test of sphericity on df_t5; display the statistic and its p-value.
chi_square_value, p_value = calculate_bartlett_sphericity(df_t5)
(chi_square_value, p_value)

In [109]:
from factor_analyzer.factor_analyzer import calculate_kmo

# Kaiser-Meyer-Olkin sampling-adequacy measure for df_t5; the call returns two
# values, unpacked here as kmo_all and kmo_model.
kmo_all, kmo_model = calculate_kmo(df_t5)

In [110]:
# Display the second value returned by calculate_kmo(df_t5).
kmo_model

In [9]:
# Fit a 10-factor model with varimax rotation to df_t5.
fa = FactorAnalyzer(rotation='varimax', n_factors=10)
fa.fit(df_t5)


In [21]:
# Pull the fitted model's outputs into plain variables for the cells below.
loadings = fa.loadings_
rotated = fa.rotation_matrix_
# The trailing ':' that used to follow this call was a SyntaxError.
communalities = fa.get_communalities()

In [15]:
# Column labels of df_t5; used below as the row index of the loadings tables.
col5 = df_t5.columns

In [25]:
# Variance explained by each factor. get_factor_variance() returns, in order:
#   1. sum of squared loadings (variance)
#   2. proportional variance
#   3. cumulative variance
summary = fa.get_factor_variance()
pd.DataFrame.from_records(summary)

In [144]:
# Loadings as a table: one row per variable (labelled with col5), one column
# per factor.
# TODO: look for sortings of factors.
pd.DataFrame.from_records(loadings, index=col5)

In [128]:
# Scree plot: eigenvalue against factor number.
ev, v = fa.get_eigenvalues()
xvals = range(1, df_t5.shape[1] + 1)
plt.scatter(xvals, ev)
plt.title('Scree plot')
plt.xlabel('Factor')
plt.ylabel('Eigenvalue')
# grid and show have to be *called*; the bare attribute references `plt.grid`
# and `plt.show` used previously did nothing.
plt.grid()
plt.show()

In [136]:
# Apply the fitted model to df_t5 and display the result
# (presumably the factor scores per row — confirm against the factor_analyzer docs).
fa.transform(df_t5)

In [None]:
# Display the communalities obtained from fa.get_communalities().
communalities 

In [39]:
# For every row of loadings, flag with 1 the entries equal to that row's
# maximum and with 0 everything else.
# NOTE(review): the maximum is taken on signed values, so a strong negative
# loading is never selected — confirm that is intended.
loadings_mask = []
for factor_row in loadings:
    row_peak = max(factor_row)
    loadings_mask.append([1 if entry == row_peak else 0 for entry in factor_row])

# Element-wise product: keeps each row's maximum loading and zeroes the rest.
max_loadings = loadings_mask * loadings
max_loadings

In [40]:
# Wrap the masked loadings in a DataFrame whose rows are labelled with col5.
loadings_df=pd.DataFrame.from_records(max_loadings, index = col5)

In [53]:
loadings_df = pd.DataFrame.from_records(max_loadings, index=col5)
# Raise the display cap so that up to 999 rows are shown.
pd.options.display.max_rows = 999
# Sort on every column in turn, largest values first.
loadings_df.sort_values(by=list(loadings_df.columns), ascending=False)

In [59]:
# Small example matrix used to check how `*` behaves on two arrays.
array1 = np.array([
    [1, 2],
    [3, 4],
])
array1

In [58]:
# 0/1 example mask with ones on the anti-diagonal.
array2 = np.array([
    [0, 1],
    [1, 0],
])
array2

In [60]:
# `*` on two ndarrays multiplies element by element (it is not a matrix product).
array1*array2

In [64]:
def color_negative_red(val):
    """
    Return the CSS text-colour declaration for a scalar value.

    Parameters
    ----------
    val : scalar
        Any value that can be compared with 0.

    Returns
    -------
    str
        ``'color: red'`` when ``val`` is negative, ``'color: black'`` otherwise.
    """
    color = 'red' if val < 0 else 'black'
    # No trailing characters after the placeholder: the previous format string
    # appended a stray 's', producing invalid CSS such as 'color: reds'.
    return 'color: {}'.format(color)


In [72]:
def colorin(x):
    """
    Choose a CSS background declaration for a single value.

    Returns a yellow background for values above 0.8, a green background for
    values above 0.3 (up to and including 0.8), and an empty string otherwise.
    """
    if x > 0.8:
        return 'background-color : yellow'
    if x > 0.3:
        return 'background-color : green'
    return ''

# Show the sorted loadings with colorin's background colour applied to each cell.
# NOTE(review): Styler.applymap is deprecated since pandas 2.1 in favour of
# Styler.map; kept here for compatibility with older pandas.
(
    loadings_df
    .sort_values(by=list(loadings_df.columns), ascending=False)
    .style
    .applymap(colorin)
)