# this notebook analyzes my bank account

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from datetime import date, timedelta
import calendar
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

## load the data and preprocess it

In [None]:
# Read the exported statements, forcing the dtype of every known column.
dtypes = {
    'date': 'str',
    'valeur': 'str',
    'nature_operation': 'str',
    'debit': 'float32',
    'credit': 'float32',
}
data = pd.read_csv('final_data.csv', dtype=dtypes)
# The first column of the CSV is discarded.
data = data.drop(columns=data.columns[0])

In [None]:
#change dtype
data.date = pd.to_datetime(data.date, dayfirst = True)
data.valeur = pd.to_datetime(data.valeur, dayfirst = True)

In [None]:
pd.set_option('display.max_rows', data.shape[0]+1)

In [None]:
data

In [None]:
data['dt'] = data.date.diff()

In [None]:
data.info()

In [None]:
data.dt = data.dt.dt.days

In [None]:
data = data.fillna(0)

In [None]:
# Month boundaries: the date at index 0 opens month 0, and each following
# boundary is placed one month-length (in days) after the previous one.
start_date = data['date'][0]
dates = {start_date: 0}
for i in range(1, 7):
    _weekday, days_in_month = calendar.monthrange(start_date.year, start_date.month)
    new_limit = start_date + timedelta(days=days_in_month)
    dates[new_limit] = i
    start_date = new_limit

In [None]:
def find_date(dates, date):
    '''
    Return the month index of the latest boundary strictly before ``date``.

    dates: dict mapping each boundary value to its month index.
    date: value to classify; must be comparable with the keys of ``dates``.

    Returns the index stored for the greatest key that is strictly smaller
    than ``date``, or 0 when no key is smaller.
    '''
    # Select the greatest boundary explicitly so the answer does not depend
    # on the insertion order of ``dates``.
    passed = [d_lim for d_lim in dates if date > d_lim]
    if not passed:
        return 0
    return dates[max(passed)]

In [None]:
data['month'] = data.date.apply(lambda x: find_date(dates, x))

In [None]:
data

## add some interesting features

In [None]:
cumsum = data[['debit','credit','month']].groupby('month').cumsum()

In [None]:
cumsum.columns = ['cum_debit','cum_credit']

In [None]:
sns.lineplot(x = data.date, y = data.debit)

In [None]:
data = data.join(cumsum, how = 'inner')

In [None]:
sns.lineplot(x = data.date, y = data.cum_credit, color = 'g')
sns.lineplot(x = data.date, y = data.cum_debit, color = 'r')

In [None]:
#difference of cumulatives => rolling treasury
data['diff'] = data['cum_credit'] - data['cum_debit']

In [None]:
data

In [None]:
sns.lineplot(x = data.date, y = data['diff'])

In [None]:
# difference day to day
data['diff_d2d'] = data['credit'] - data['debit']

In [None]:
sns.lineplot(x = data.date, y = data['diff_d2d'])

In [None]:
data[['credit','debit','month']].groupby('month').sum()

## Time to analyze the "nature_operation" feature

In [None]:
# Show the raw operation labels.
data.nature_operation.values

In [None]:
# Fetch the NLTK resources named 'stopwords' and 'wordnet'.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
def text_process(text):
    '''
    Clean one piece of text and return its lemmatized tokens.

    Steps, in order:
    1. Drop every punctuation character and every digit.
    2. Split on whitespace and lowercase each token.
    3. Discard English stopwords (compared after lowercasing, so
       capitalized stopwords are removed as well).
    4. Lemmatize the remaining tokens with WordNetLemmatizer.

    Returns the cleaned text as a list of words.
    '''
    lemmatizer = WordNetLemmatizer()
    # Load the stopword list once per call instead of once per token.
    english_stopwords = set(stopwords.words('english'))
    cleaned = ''.join(
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    )
    tokens = [word.lower() for word in cleaned.split()]
    return [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in english_stopwords
    ]

In [None]:
text_process(data.nature_operation.values[0])

In [None]:
corpus = []
for text in data.nature_operation.values:
    corpus.append(text_process(text))

In [None]:
corpus

### feature engineering of the text data

In [None]:
vectorizer = TfidfVectorizer(analyzer = text_process,ngram_range = (1,3)).fit(data.nature_operation.values)

In [None]:
transformed = vectorizer.transform(data.nature_operation.values)

In [None]:
tf_idf = pd.DataFrame(data = transformed.toarray(), columns=vectorizer.get_feature_names())

In [None]:
final_df = tf_idf

print("{} rows".format(final_df.shape[0]))
final_df.T.nlargest(5, 0)

### do we get anything from this?

In [None]:
def run_KMeans(max_k, data):
    '''
    Fit one KMeans model for every cluster count from 2 to max_k inclusive.

    max_k: largest number of clusters to try.
    data: feature matrix handed to KMeans.fit.

    Returns a dict mapping each k to its fitted KMeans estimator.
    '''
    kmeans_results = {}
    for k in range(2, max_k + 1):
        # n_jobs is no longer a KMeans parameter, and algorithm='full' was
        # renamed to 'lloyd' (requires scikit-learn >= 1.1).
        kmeans = KMeans(
            n_clusters=k,
            init='k-means++',
            n_init=10,
            tol=0.0001,
            random_state=1,
            algorithm='lloyd',
        )
        kmeans_results[k] = kmeans.fit(data)
    return kmeans_results

In [None]:
# Fit a KMeans model for every cluster count from 2 up to k.
k = 10
kmeans_results = run_KMeans(max_k=k, data=final_df)

In [None]:
def get_top_features_cluster(tf_idf_array, prediction, n_feats, feature_names=None):
    '''
    Rank the highest-scoring features of every cluster.

    tf_idf_array: 2-D numpy array of scores, one row per sample.
    prediction: cluster label of each row of tf_idf_array.
    n_feats: number of top features kept per cluster.
    feature_names: optional names of the columns of tf_idf_array; when
        omitted they are read from the module-level ``vectorizer``.

    Returns a list with one DataFrame per distinct label (in sorted label
    order). Each DataFrame has the columns 'features' and 'score' and is
    ordered by descending mean score within the cluster.
    '''
    if feature_names is None:
        # get_feature_names() was removed in scikit-learn 1.2; look the names
        # up once rather than on every loop iteration.
        feature_names = vectorizer.get_feature_names_out()
    prediction = np.asarray(prediction)
    dfs = []
    for label in np.unique(prediction):
        # mean score of every feature over the rows assigned to this cluster
        x_means = np.mean(tf_idf_array[prediction == label], axis=0)
        # positions of the n_feats largest means, best first
        top_idx = np.argsort(x_means)[::-1][:n_feats]
        best_features = [(feature_names[i], x_means[i]) for i in top_idx]
        dfs.append(pd.DataFrame(best_features, columns=['features', 'score']))
    return dfs

def plotWords(dfs, n_feats):
    '''
    Draw one horizontal bar chart of the top features for every cluster.

    dfs: list of DataFrames with 'features' and 'score' columns, one per
        cluster; the list position is used as the cluster number in the title.
    n_feats: number of leading rows of each DataFrame to plot.
    '''
    for i, df in enumerate(dfs):
        # Open a fresh figure for every cluster: plt.show() is called per
        # chart, so a single figure created before the loop would only give
        # the first chart the requested size.
        plt.figure(figsize=(8, 4))
        plt.title("Most Common Words in Cluster {}".format(i), fontsize=10, fontweight='bold')
        sns.barplot(x='score', y='features', orient='h', data=df[:n_feats])
        plt.show()

In [None]:
# Inspect the model that was fitted with 10 clusters.
best_result = 10
n_feats = 20
kmeans = kmeans_results.get(best_result)

final_df_array = final_df.to_numpy()
prediction = kmeans.predict(final_df)
dfs = get_top_features_cluster(final_df_array, prediction, n_feats)
plotWords(dfs, n_feats=13)

In [None]:
from sklearn.cluster import KMeans
# Record the inertia of a KMeans fit for k = 1, 4, 7, ... below 80.
K = range(1, 80, 3)
Sum_of_squared_distances = []
for k in K:
    km = KMeans(n_clusters=k).fit(transformed)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
import matplotlib.pyplot as plt
# Plot the recorded sum of squared distances against the number of clusters.
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distance')
plt.show()

In [None]:
svd = TruncatedSVD(n_components = 2, random_state = 0)
svd.fit(X = transformed)

In [None]:
svd.explained_variance_ratio_

In [None]:
reduced_features = svd.transform(transformed)

In [None]:
cls = MiniBatchKMeans(n_clusters = 10, random_state = 0)
cls.fit(transformed)
preds = cls.predict(transformed)
cls.labels_

In [None]:
reduced_cluster_c = svd.transform(cls.cluster_centers_) 

In [None]:
plt.scatter(reduced_features[:,0], reduced_features[:,1], c=cls.predict(transformed))
plt.scatter(reduced_cluster_c[:, 0], reduced_cluster_c[:,1], marker='x', s=150, c='b')

### Use the Google search API to retrieve some additional information

In [None]:
pip install google

In [None]:
from googlesearch import search

In [None]:
# Use the operation label stored under index 1 as the search query.
query = data.nature_operation[1]

In [None]:
# NOTE(review): with stop=None the result is presumably an unbounded lazy
# iterator, and nothing visible here consumes it — confirm against the
# googlesearch documentation and consider a finite stop.
results = search(query, tld='com', lang='fr', num=5, start=0, stop=None, pause=2.0)