In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.graph_objs as go
import numpy as np
import requests
import json
from collections import Counter
import scipy as sp
import os
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import nltk
import codecs
from sklearn import feature_extraction
import mpld3
from nltk.stem.snowball import SnowballStemmer
import re
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [None]:
# External CSS that the dashboard pages should load.
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# Hand the stylesheet list to Dash. It used to be defined but never passed,
# so the app was created without it.
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

In [None]:
# Load the movie table and keep only rows with popularity below 400.
# .copy() makes the filtered frame independent of the one read from disk, so
# the column assignments below do not raise pandas' SettingWithCopyWarning.
movies = pd.read_csv("tmdb_5000_movies.csv")
movies = movies[movies['popularity'] < 400].copy()

# Standardise popularity and budget; these two columns are what KMeans is fit on later.
movies_sb1 = movies[['popularity', 'budget']]
#movies_sb1 = movies[['vote_average', 'budget']]
scaler = StandardScaler()
movies_sb1_arr = scaler.fit_transform(movies_sb1)
pop, bud = movies_sb1_arr.T
movies['std_popularity'] = pop.tolist()
#movies['std_va'] = pop.tolist()
movies['std_budget'] = bud.tolist()

# Snapshot of the frame at this point; the cluster-slider callback copies it
# before attaching fresh cluster labels.
dup_movies = movies.copy()

# NLP for Similar Movies

In [None]:
# Replace missing overviews with a placeholder. The result is assigned back to
# the column: calling replace(..., inplace=True) on `movies.overview` operates
# on an intermediate Series and is not guaranteed to reach `movies` on newer
# pandas. This also avoids the np.NaN alias, which NumPy 2.0 removed.
movies['overview'] = movies['overview'].fillna('No Description')

synopses = movies['overview'].tolist()

# Tokenise each synopsis and drop English stop words and one-character tokens.
# A single pass with set membership replaces the nested remove() loop, which
# rebuilt list(stopWords) for every token.
stopWords = set(stopwords.words('english'))
overview = [
    [w for w in word_tokenize(text) if w not in stopWords and len(w) != 1]
    for text in synopses
]

# Skip-gram (sg=1) model with 50-dimensional vectors; min_count=1 keeps every token.
# NOTE(review): `size=` looks like the gensim 3.x keyword (newer releases call
# it `vector_size`) - confirm against the installed gensim version.
m = Word2Vec(overview, size=50, min_count=1, sg=1)
def vectorizer(sent, m):
    """Return the mean embedding of the tokens in ``sent``.

    Args:
        sent: iterable of tokens.
        m: object supporting ``m[token]`` that yields the token's vector and
            raises ``KeyError`` for tokens it does not know.

    Returns:
        A NumPy array with the element-wise mean of the vectors of every token
        found in ``m``, or ``None`` when no token could be looked up (this
        includes an empty ``sent``).
    """
    vectors = []
    for w in sent:
        try:
            vectors.append(m[w])
        except KeyError:
            # Unknown token: leave it out of the average.
            continue

    if not vectors:
        return None

    # The return statement used to sit inside the loop, so the function stopped
    # after the first token instead of averaging the whole sentence.
    return np.mean(np.asarray(vectors), axis=0)
# One entry per synopsis; entries may be None (handled just below).
l = [vectorizer(tokens, m) for tokens in overview]
X = ''
X = np.array(l)

# Plain Python lists, with an empty list standing in for None entries.
l2 = [vec.tolist() if vec is not None else [] for vec in l]

# Right-pad every row with zeros up to the longest row so the matrix is rectangular.
length = max(len(row) for row in l2)
y = np.array([row + [0] * (length - len(row)) for row in l2])

# Group the synopsis vectors into 50 clusters and store the label on each movie.
km = KMeans(n_clusters=50, random_state=10)
km.fit(y)

movies['overview_cluster'] = km.labels_

# Popularity vs Budget Clustering

In [None]:
# Starting number of clusters; the layout also uses it as the slider's initial value.
default_clusters = 4

# Fit KMeans on the standardised popularity/budget pairs and keep the labels.
kmeans = KMeans(n_clusters=default_clusters, random_state=10)
kmeans.fit(movies[['std_popularity', 'std_budget']].values)
movies['predicted'] = kmeans.labels_

# Scatter in the original (unscaled) units, coloured by cluster label.
fig = px.scatter(
    movies,
    x="budget",
    y="popularity",
    color="predicted",
    hover_data=['title'],
)
#fig.show()

# Recommendations

In [None]:
import textwrap

# Renumber the rows from zero; the previous index is kept as a regular column.
movies = movies.reset_index()

# Wrap every overview at 30 characters and join the pieces with <br> so the
# text can be shown on several lines in a hover label.
movies['split_overview'] = [
    '<br>'.join(textwrap.wrap(text, width=30)) for text in movies['overview']
]
movies['split_overview'][0]

In [None]:
# Overview cluster shown in the bar chart before any point is clicked.
default_overview_cluster = 0

# Five most popular titles of that cluster, with the wrapped overview as hover title.
bar = px.bar(
    movies.loc[movies['overview_cluster'] == default_overview_cluster].nlargest(5, 'popularity'),
    x='title',
    y='popularity',
    hover_name="split_overview",
)

bar.update_layout(hoverlabel={'font': {'size': 10}})
#bar.show()

In [None]:
# Disabled draft: everything between the triple quotes is a bare string
# expression, so none of it executes. It sketches a parallel-categories
# figure built from the first 100 rows of `movies`.
'''
movies['Time'] = pd.to_datetime(movies.release_date)
movies.Time = movies.Time.dt.year
movies['bud'] = movies['budget']

dim_list = ['runtime', 'Time', 'vote_average', 'revenue']

dimensions = [dict(values=movies[:100][label], label=label) for label in dim_list]
colorscale = [[0, 'gray'], [1, 'firebrick']]
color = np.zeros(len(movies[:100]), dtype='uint8')

parcat = go.Parcats(
        domain={'y': [0, 0.4]}, dimensions=dimensions,
        line={'colorscale': 'Electric',  'cmin': 0,
              'cmax': 2787965087, 'color': movies['revenue'], 'shape': 'hspline'})
parwidget = go.FigureWidget(
    data=[parcat
   ])
   '''
#parwidget

In [None]:
#TEST

# Release year as its own column, plus a second copy of the budget column.
movies['Time'] = pd.to_datetime(movies['release_date']).dt.year
movies['bud'] = movies['budget']
colorscale = [[0, 'gray'], [1, 'firebrick']]
color = np.zeros(len(movies[:5]), dtype='uint8')

# Standardised runtime and revenue, rounded to one decimal place.
# This refits the shared `scaler` on the runtime/revenue columns.
movies_sb2 = movies[['runtime', 'revenue']]
movies_sb2_arr = scaler.fit_transform(movies_sb2)
run, rev = movies_sb2_arr.T
movies['std_runtime'] = run.tolist()
movies['std_revenue'] = rev.tolist()
movies['std_revenue'] = movies['std_revenue'].round(1)
movies['std_runtime'] = movies['std_runtime'].round(1)

# Columns shown in the parallel-categories panel and their display labels.
# The selection callback reads both lists as well.
dim_list = ['title', 'runtime', 'Time', 'revenue']
label = ['Title', 'Runtime', 'Release Year', 'Revenue']

# The initial figure previews the first five movies only.
dimensions = [
    dict(values=movies[:5][column], label=caption)
    for column, caption in zip(dim_list, label)
]

parcat = go.Parcats(
    domain={'y': [0, 0.4]},
    dimensions=dimensions,
    # One colour value per previewed row. The whole revenue column used to be
    # sent here even though the figure only has five rows.
    # cmax is presumably the largest revenue in the dataset - TODO confirm.
    line={'colorscale': 'Electric', 'cmin': 0,
          'cmax': 2787965087, 'color': movies[:5]['revenue'], 'shape': 'hspline'},
    hoverinfo='none',
)
parwidget = go.FigureWidget(data=[parcat])

parwidget

In [None]:
#t=[0.333,0.55,1.6]
#int()

In [None]:
#this 
# Disabled experiment: the whole block is a bare string expression and never
# executes. It sketches a FigureWidget that combines a budget/popularity
# scatter with a parallel-categories trace, plus an `update_color` handler
# registered for scatter selection and parcats clicks.
'''
sp2 = go.Scatter(x=movies.budget, y=movies.popularity,
    marker={'color': movies.predicted}, mode='markers', selected={'marker': {'color': 'firebrick'}},
    unselected={'marker': {'opacity': 0.3}})


#this
movies['Time'] = pd.to_datetime(movies.release_date)
movies.Time = movies.Time.dt.year

movies['bud'] = movies['budget']
#movies.Time = movies.Time.dt.year
# Build parcats dimensions
categorical_dimensions = ['language', 'vote_average', 'status']

#dimensions = [dict(values=movies[label], label=label) for label in categorical_dimensions]
# Create dimensions
Vote_dim = go.parcats.Dimension(
    values=movies[movies['Time'] > 2015].vote_average,
    categoryorder='category ascending',
    label="Vote")

Time_dim = go.parcats.Dimension(
    values=movies[movies['Time'] > 2015].Time,
    categoryorder='category ascending',
    label="Year")
Budget_dim = go.parcats.Dimension(
  values=movies[movies['Time'] > 2015].bud,
  label="budget",
  categoryorder='category ascending')

dimensions = list([
            dict(range = [0,10],
                 label = "Votes", values = movies['vote_average']),
            dict(range = [1916,2017],
                 label = 'Year', values = movies['Time']),
            #dict(tickvals = [0,0.5,1,2,3],
             #    ticktext = ['A','AB','B','Y','Z'],
              #   label = 'Cyclinder Material', values = df['cycMaterial']),
            dict(range = [0,380000000],
                 label = 'Budget', values = movies['budget'])])
   
# Build colorscale
color = np.zeros(len(movies), dtype='uint8')
colorscale = [[0, 'gray'], [1, 'firebrick']]


pc_fig = go.Parcats(
        domain={'y': [0, 0.4]}, dimensions=[Vote_dim, Time_dim, Budget_dim],
        line={'colorscale': colorscale, 'cmin': 0, 'cmax': 1, 'color': color, 'shape': 'hspline'})


pcsp_fig = go.FigureWidget(
    data=[sp2, pc_fig
    ])



pcsp_fig.update_layout(
        xaxis={'title': 'Budget'},
        yaxis={'title': 'Popularity', 'domain': [0.6, 1]},
        dragmode='lasso', hovermode='closest')




# Update color callback
def update_color(trace, points, state):
    # Update scatter selection
    pcsp_fig.data[0].selectedpoints = points.point_inds

    # Update parcats colors
    new_color = np.zeros(len(movies[movies['Time']>2015]), dtype='uint8')
    new_color[points.point_inds] = 1
    pcsp_fig.data[1].line.color = new_color

# Register callback on scatter selection...
pcsp_fig.data[0].on_selection(update_color)
# and parcats click
pcsp_fig.data[1].on_click(update_color)
'''
#pcsp_fig

# Dashboard Layout

In [None]:
#this
# Page layout, top to bottom: heading, cluster-count slider, cluster scatter,
# recommendation bar chart, and the parallel-categories detail panel.
app.layout = html.Div(children=[
    html.H1(children='Movie Clusters (Click for Recommendations, Lasso for Details)'),

    dcc.Slider(
        id='cluster-slider',
        min=1,
        max=8,
        value=default_clusters,
        # With step=None the slider can only rest on a mark, so the marks
        # should cover exactly min..max. range(9) also created a '0' mark
        # below min, which is not a valid cluster count.
        marks={str(num): str(num) for num in range(1, 9)},
        step=None
    ),

    dcc.Graph(
        id='scatter',
        figure=fig
    ),

    dcc.Graph(
        id='bar',
        figure=bar
    ),
    html.H1(children='Movie Details'),
    dcc.Graph(
        id='par',
        figure=parwidget
    )
])



In [None]:
# Disabled layout variant: a bare string expression that never executes.
# It places the slider under the scatter and has no 'par' detail graph.
'''
app.layout = html.Div(children=[
    html.H1(children='Movie Clusters (Click for Recommendations)'),

    dcc.Graph(
        id='scatter',
        figure=fig
    ),
    dcc.Slider(
        id='cluster-slider',
        min= 1,
        max= 8,
        value=default_clusters,
        marks={str(num): str(num) for num in range(9)},
        step=None
     ),

    dcc.Graph(
        id='bar',
        figure=bar
    )
])

'''

In [None]:
@app.callback(
    Output('scatter', 'figure'),
    [Input('cluster-slider', 'value')],
)
def update_scatter(input_value):
    """Re-cluster the movies for the chosen cluster count and redraw the scatter.

    Args:
        input_value: current value of the 'cluster-slider' component; passed
            to KMeans as ``n_clusters``.

    Returns:
        A plotly express scatter of budget against popularity, coloured by
        the newly fitted cluster labels.
    """
    app.logger.info(input_value)

    # Fit on the standardised columns of `movies` ...
    model = KMeans(n_clusters=input_value, random_state=10)
    model.fit(movies[['std_popularity', 'std_budget']].values)

    # ... and attach the labels to a fresh copy of the snapshot frame.
    plot_df = dup_movies.copy()
    plot_df['predicted'] = model.labels_

    return px.scatter(
        plot_df,
        x="budget",
        y="popularity",
        color="predicted",
        hover_data=['title'],
    )

In [None]:
#TEST 
# Disabled variant of the cluster-slider callback: a bare string expression
# that never executes. It builds the scatter with go.Scatter inside a
# go.FigureWidget instead of px.scatter.
'''
@app.callback(
    dash.dependencies.Output('scatter','figure'),
    [dash.dependencies.Input('cluster-slider', 'value')]
)
def update_scatter(input_value):
    app.logger.info(input_value)
    new_kmeans = KMeans(n_clusters=input_value, random_state=10).fit(movies[['std_popularity','std_budget']].values)
    data = dup_movies.copy()
    data['predicted'] = new_kmeans.labels_
    #print(input_value)
    #print(np.array(new_kmeans.labels_).max())
    #print(data['predicted'].values.max())
    sp2 = go.Scatter(x=data.budget, y=data.popularity,
             marker={'color': data.predicted}, mode='markers', selected={'marker': {'color': 'firebrick'}},
             unselected={'marker': {'opacity': 0.3}})
    
    pcsp_fig = go.FigureWidget(
    data=[sp2
    ])

    return pcsp_fig
    '''

In [None]:
@app.callback(Output("bar", "figure"), [Input("scatter", "clickData")])
def event_cb(data):
    """Recommend popular movies from the clicked movie's overview cluster.

    Args:
        data: the scatter's ``clickData``; ``None`` until a point has been clicked.

    Returns:
        A bar chart of up to five of the most popular other titles whose
        ``overview_cluster`` matches that of the clicked movie.

    Raises:
        dash.exceptions.PreventUpdate: when there is no click to react to, or
            the clicked title is not found in ``movies``. The chart currently
            on screen is then left unchanged.
    """
    # Dash calls the callback once on page load with clickData=None; indexing
    # into it used to raise a TypeError.
    if not data or not data.get('points'):
        raise dash.exceptions.PreventUpdate

    # The scatter is built with hover_data=['title'], so the title is the
    # first customdata entry of the clicked point.
    clicked_title = data['points'][0]['customdata'][0]
    app.logger.info(clicked_title)

    clicked_rows = movies[movies['title'] == clicked_title]
    if clicked_rows.empty:
        raise dash.exceptions.PreventUpdate
    # If several rows share the title, the first one decides the cluster.
    cluster_id = clicked_rows['overview_cluster'].iloc[0]

    same_cluster = movies['overview_cluster'] == cluster_id
    other_title = movies['title'] != clicked_title
    recommendations = movies[same_cluster & other_title].nlargest(5, 'popularity')

    bar = px.bar(recommendations,
                 x='title', y='popularity', hover_name="split_overview")

    bar.update_layout(
        title="If You Liked " + str(clicked_title) + ", You Should Also Watch :",
        hoverlabel=dict(font=dict(size=10)))
    return bar

In [None]:
#TEST
@app.callback(
    dash.dependencies.Output('par','figure'),
    [dash.dependencies.Input('scatter', 'selectedData')]
)
def update_scatter(input_value):
    """Rebuild the parallel-categories panel for the movies selected in the scatter.

    Args:
        input_value: the scatter's ``selectedData``; ``None`` until the user
            makes a selection.

    Returns:
        A figure with one Parcats trace over the columns in ``dim_list``,
        restricted to the selected titles and coloured by their revenue.

    Raises:
        dash.exceptions.PreventUpdate: when nothing is selected, so the panel
            keeps its current figure.
    """
    # NOTE(review): this function reuses the name of the cluster-slider
    # callback defined earlier and rebinds it. Both callbacks are registered
    # by their decorators, but a distinct name would be clearer.

    # Dash calls the callback on page load with selectedData=None; indexing
    # into it used to raise a TypeError.
    if not input_value or not input_value.get('points'):
        raise dash.exceptions.PreventUpdate

    # The scatter is built with hover_data=['title'], so each point carries
    # its title as the first customdata entry.
    movie_list = [point['customdata'][0] for point in input_value['points']]
    par_df = movies[movies['title'].isin(movie_list)]

    dimensions = [
        dict(values=par_df[column], label=caption)
        for column, caption in zip(dim_list, label)
    ]
    parcat = go.Parcats(
        domain={'y': [0, 0.4]},
        dimensions=dimensions,
        # Colour values must come from the same rows as the dimensions. The
        # full movies['revenue'] column was used before, so colours did not
        # correspond to the selected movies.
        # cmax is presumably the largest revenue in the dataset - TODO confirm.
        line={'colorscale': 'Electric', 'cmin': 0,
              'cmax': 2787965087, 'color': par_df['revenue'], 'shape': 'hspline'},
        hoverinfo='none')

    # A plain Figure is enough for dcc.Graph and avoids creating a notebook
    # widget on every request.
    return go.Figure(data=[parcat])

In [None]:
#@app.callback(Output('par','figure'), [Input("bar", "clickData")])
#def bar_click(data):
#    print(data)

In [None]:
#TEST 
# Disabled copy of the `event_cb` click callback: a bare string expression
# that never executes.
'''
@app.callback(Output("bar", "figure"), [Input("scatter", "clickData")])
def event_cb(data):
    print(data['points'][0]['customdata'][0])
    temp1 = movies[movies['title']==data['points'][0]['customdata'][0]]
    temp1=temp1.reset_index()
    temp1.head()
    print(temp1['overview_cluster'][0])
    temp2 = movies[(movies['overview_cluster']==temp1['overview_cluster'][0]) 
                   & (movies['title']!=data['points'][0]['customdata'][0])].nlargest(5,'popularity')
    temp2 = temp2.reset_index()
    temp2.head()
    bar = px.bar(temp2,
             x='title', y='popularity', hover_name="split_overview")

    bar.update_layout(
        title="If You Liked " + str(data['points'][0]['customdata'][0]) + ", You Should Also Watch :",
        hoverlabel=dict(font=dict(size=10)))
    return bar
    '''

In [None]:
# Start the Dash server for `app` with debug mode switched off.
# NOTE(review): this call presumably blocks the cell until the server is stopped - confirm.
app.run_server(debug=False)