# Creating Mock Dataframe

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

features = {'Username':None, 'Age-range':None, 'Language':None, 'Fields_of_interest':None, 'Other':None}

mock_df = pd.DataFrame(columns=features)
mock_df

df = pd.read_csv(r'NationalNames.csv')
names = df.Name
usernames = names.apply(lambda x: x+'@gmail.com')

fields_of_interest = """Agriculture/Gardening
Architecture
Childcare
Cooking/Dining
Crafts
Current Affairs
Design/Graphic Arts
Education/Teaching
Engineering/Electronics
Environment/Natural Resources
Film
Finance/Investment
Fine, Visual, Performing Arts
Gaming
Government/Public Sector/Policy
Healthcare/Medicine
History
Hospitality/Tourism
Intelligence/Criminal Justice/Security
Mindfulness
Music
Petcare
Reading/Books
Retail/Shopping
Science
Sports
Tech
Travel
Volunteering
Writing/Publishing/Translation"""

fields_of_interest = fields_of_interest.split('\n')

np.random.choice(fields_of_interest, 5)

age_ranges = ['under 25', '26-35', '36-45', '46-55', '56-65', 'over 65']

language = ['English', 'Russian', 'Hebrew', 'Arabic', 'Spanish', 'French', 'German']

np.random.choice(language, 1)[0]

list_of_interests = ['Dance, Disco, Funk',
       'Folk music', 'Country', 'Classical', 'Musicals', 'Pop', 'Rock',
       'Metal, Hard rock', 'Punk', 'Hip hop, Rap', 'Reggae, Ska',
       'Swing, Jazz', 'Rock n Roll', 'Alternative music', 'Latin',
       'Techno, Trance', 'Opera', 'I really enjoy watching movies.',
       'Horror movies', 'Thriller movies', 'Comedies', 'Romantic movies',
       'Sci-fi movies', 'War movies', 'Tales', 'Cartoons',
       'Documentaries', 'Western movies', 'Action movies', 'History',
       'Psychology', 'Politics', 'Mathematics', 'Physics', 'Internet',
       'PC Software, Hardware', 'Economy, Management', 'Biology',
       'Chemistry', 'Poetry reading', 'Geography', 'Foreign languages',
       'Medicine', 'Law', 'Cars', 'Art', 'Religion', 'Outdoor activities',
       'Dancing', 'Playing musical instruments', 'Poetry writing',
       'Sport and leisure activities', 'Sport at competitive level',
       'Gardening', 'Celebrity lifestyle', 'Shopping',
       'Science and technology', 'Theatre', 'Socializing',
       'Adrenaline sports', 'Pets', 'Flying', 'Thunder, lightning',
       'Darkness', 'Heights', 'Spiders', 'Snakes', 'Rats, mice', 'Ageing',
       'Dangerous dogs', 'Public speaking', 'Smoking habits', 'Drinking',]
list_of_interests = [string.split(' ') for string in list_of_interests]

list_of_interests = [item for sublist in list_of_interests for item in sublist]
len(list_of_interests)

k=np.random.randint(2,8,1)
', '.join(np.random.choice(list_of_interests, k))

usernames.size

listed_interests, free_interests, ages, languages = [], [], [], []
for i in range(usernames.size):
    if i==50000:
        break
    k=np.random.randint(1,7,1)
    other_interests = ', '.join(np.random.choice(list_of_interests, k))
    free_interests.append(other_interests)
    age = np.random.choice(age_ranges, 1)[0]
    ages.append(age)
    lstd_interests = np.random.choice(fields_of_interest, 5)
    listed_interests.append(lstd_interests)
    languages.append(np.random.choice(language))

features['Username']=pd.Series(usernames.values[:50000])
features['Age-range']=pd.Series(ages)
features['Fields_of_interest']=pd.Series(listed_interests)
features['Other']=pd.Series(free_interests)
features['Language']=pd.Series(languages)

df_final = pd.concat([features['Username'], features['Age-range'], features['Language'], features['Fields_of_interest'], features['Other']], axis=1)

In [37]:
df_final.columns=features.keys()

In [38]:
df_final.head()

Unnamed: 0,Username,Age-range,Language,Fields_of_interest,Other
0,Mary@gmail.com,36-45,Spanish,"[Reading/Books, Childcare, Film, Crafts, Cooki...","activities, Techno,, n, languages, Flying"
1,Anna@gmail.com,over 65,Spanish,"[Childcare, Childcare, Petcare, Cooking/Dining...","watching, Darkness, reading, Horror, movies, rock"
2,Emma@gmail.com,under 25,Hebrew,"[Hospitality/Tourism, Hospitality/Tourism, Coo...","movies, Internet, Country, Poetry, Horror"
3,Elizabeth@gmail.com,56-65,Arabic,"[Fine, Visual, Performing Arts, Music, Crafts,...","habits, PC, Thriller"
4,Minnie@gmail.com,26-35,Arabic,"[Crafts, Engineering/Electronics, Environment/...","Pop, music"


# Preprocessing the Data

In [18]:
from collections import Counter

# LDA

In [22]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
stop_words = set(stopwords.words('english'))

In [23]:
vectorizer_lda = CountVectorizer(stop_words=stop_words)

In [43]:
documents = df_final.Other.tolist()

In [44]:
documents

['activities, Techno,, n, languages, Flying',
 'watching, Darkness, reading, Horror, movies, rock',
 'movies, Internet, Country, Poetry, Horror',
 'habits, PC, Thriller',
 'Pop, music',
 'leisure, Country, Western, Hip, activities',
 'instruments, Rock, really, sports, Sport',
 'Roll, Dancing, technology, enjoy, competitive, sports',
 'Foreign, n, music, Public, Medicine, really',
 'lightning, Tales, Science, and, movies',
 'Physics, Flying, movies, Poetry, Snakes, movies',
 'Action, Playing, History, Drinking, Opera',
 'PC, Tales, Swing,, Tales, movies., leisure',
 'Swing,',
 'writing',
 'Mathematics',
 'movies., War, Flying, Foreign',
 'Shopping, I, Action, Classical, Shopping',
 'Musicals, Ska',
 'Musicals, Rap, War',
 'Dangerous, Software,, Metal,, Alternative',
 'Heights, and, Classical, Theatre',
 'Socializing, Poetry, Chemistry',
 'Punk, Darkness, Trance',
 'activities, Dance,, Rats,, Celebrity, Gardening',
 'Rock, Dangerous, languages, Funk, Pop, I',
 'Gardening, Ageing, Chemis

In [45]:
X_lda = vectorizer_lda.fit_transform(documents)

In [46]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components = 90)

In [47]:
result = lda.fit_transform(X_lda)
result

array([[0.00222222, 0.00222222, 0.00222222, ..., 0.00222222, 0.00222222,
        0.00222222],
       [0.0015873 , 0.0015873 , 0.16249328, ..., 0.0015873 , 0.0015873 ,
        0.0015873 ],
       [0.00185185, 0.00185185, 0.20131103, ..., 0.00185185, 0.00185185,
        0.00185185],
       ...,
       [0.00277778, 0.00277778, 0.00277778, ..., 0.00277778, 0.00277778,
        0.00277778],
       [0.0015873 , 0.0015873 , 0.0015873 , ..., 0.0015873 , 0.0015873 ,
        0.0015873 ],
       [0.00277778, 0.00277778, 0.00277778, ..., 0.00277778, 0.00277778,
        0.25277778]])

In [48]:
result.shape

(50000, 90)

# K-means Model

In [None]:
from sklearn.cluster import KMeans

size_of_cluster = 100
num_of_clusters = int(len(df_no_name)/size_of_cluster)
kmeans = KMeans(n_clusters=num_of_clusters, random_state=0).fit(df_no_name)