In [629]:
import re
import requests
import pandas as pd
import numpy as np
import urllib
import json
from bs4 import BeautifulSoup, NavigableString

#### Below is the wikipedia api call for a category search:

`http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+machine+learning&cmlimit=max`

`action=query`: query the wikipedia api

`format=json`: return a json format

`list=categorymembers`: List of pages that belong to a given category, ordered by page sort title

`cmtitle=Category%3A+machine+learning`: title of category

`climit=max`: return up to the maximum amount of responses (500)

You may use this to get page titles from the wikipedia API. Things to watch out for:
* The responses contain categories
* You will want to fetch articles in those subcategories

The API's detailed documentation can be found [here](https://www.mediawiki.org/wiki/API:Main_page)

#### Make a function that formats a request for pages of a category

In [630]:
def generate_category(category):
    category = re.sub('\s', '_', category)
    return category


In [631]:
category = 'Category:Machine learning'

In [632]:
bs_cat = 'Category:Business software'

In [633]:
ml = generate_category(category)

In [634]:
ml

'Category:Machine_learning'

In [635]:
bs = generate_category(bs_cat)

In [636]:
bs

'Category:Business_software'

In [637]:
def cat_pages_depth(cat_name, max_depth=3):
    
    params = {'action':'query',
          'format':'json',
          'list':'categorymembers',
          'cmtitle': generate_category(cat_name),
          'cmlimit':'max',}
    
    response = requests.get('http://en.wikipedia.org/w/api.php?', params=params)
    data = response.json()
    
    members = data['query']['categorymembers']

    pages = list(filter(lambda x: x['ns'] == 0, members))
    subpages = list(filter(lambda x: x['ns'] == 14, members))
    
    while max_depth > 0:
    
        if not subpages:    
            return pages
        else:
            for subpage in subpages:
                max_depth -=1
                pages += cat_pages_depth(subpage['title'], max_depth)
            
    return pages

In [667]:
dummy = pd.DataFrame(cat_pages_depth(ml))

In [823]:
dummy_2 = pd.DataFrame(cat_pages_depth(bs))

In [642]:
dummy.shape

(1075, 3)

In [670]:
dummy['category'] = 'Machine learning'

In [671]:
dummy.head()

Unnamed: 0,ns,pageid,title,category
0,0,43385931,Data exploration,Machine learning
1,0,49082762,List of datasets for machine learning research,Machine learning
2,0,233488,Machine learning,Machine learning
3,0,53587467,Outline of machine learning,Machine learning
4,0,3771060,Accuracy paradox,Machine learning


In [824]:
dummy_2['category'] = 'Business software'

In [825]:
dummy_2.head()

Unnamed: 0,ns,pageid,title,category
0,0,1037763,Business software,Business software
1,0,41270069,AccuSystems,Business software
2,0,5211212,Active policy management,Business software
3,0,28502793,Alexandria (library software),Business software
4,0,44133735,Alteryx,Business software


In [674]:
def cleaner(message):
    message = re.sub('\.+', ' ', message)
    message = re.sub('[^a-z0-9 ]','', message.lower())
    message = re.sub('\d+','NUMBER ',message)
    message = re.sub('\s+',' ',message)
    return message

In [677]:
text_list = []
for _,row in dummy.iterrows():
    page = row['pageid']
    query = 'https://en.wikipedia.org/w/api.php?format=json&\
    action=query&prop=extracts&explaintext&pageids={}'.format(page)
    response = requests.get(query)
    page_info = response.json()
    page_text = page_info['query']['pages'][str(page)]['extract']
    page_text = cleaner(page_text)
    text_list.append(page_text)
    

In [827]:
bs_text_list = []
for _,row in dummy_2.iterrows():
    page = row['pageid']
    query = 'https://en.wikipedia.org/w/api.php?format=json&\
    action=query&prop=extracts&explaintext&pageids={}'.format(page)
    response = requests.get(query)
    page_info = response.json()
    page_text = page_info['query']['pages'][str(page)]['extract']
    page_text = cleaner(page_text)
    bs_text_list.append(page_text)

In [678]:
dummy['text'] = text_list

In [828]:
dummy_2['text'] = bs_text_list

In [681]:
dummy.head(3)

Unnamed: 0,ns,pageid,title,category,text
0,0,43385931,Data exploration,Machine learning,data exploration is an approach similar to initial data analysis whereby a data analyst uses visual exploration to understand what is in a dataset and the characteristics of the data rather than through traditional data management systems these characteristics can include size or amount of data completeness of the data correctness of the data possible relationships amongst data elements or filestables in the data data exploration is typically conducted using a combination of automated and manual activities automated activities can include data profiling or data visualization or tabular reports to give the analyst an initial view into the data and an understanding of key characteristics this is often followed by manual drilldown or filtering of the data to identify anomalies or patterns identified through the automated actions data exploration can also require manual scripting and queries into the data e g using languages such as sql or r or using excel or similar tools to view the raw data all of these activities are aimed at creating a clear mental model and understanding of the data in the mind of the analyst and defining basic metadata statistics structure relationships for the data set that can be used in further analysis once this initial understanding of the data is had the data can be pruned or refined by removing unusable parts of the data correcting poorly formatted elements and defining relevant relationships across datasets this process is also known as determining data quality at this stage the data can be considered ready for deeper analysis or be handed off to other analysts or users who have specific needs for the data data exploration can also refer to the adhoc querying and visualization of data to identify potential relationships or insights that may be hidden in the data in this scenario hypotheses may be created and then the data is explored to identify whether those hypotheses are correct traditionally this had been a key area of focus for statisticians with john tukey being a key evangelist in the field today data exploration is more widespread and is the focus of data analysts and data scientists the latter being a relatively new role within enterprises and larger organizations interactive data exploration this area of data exploration has become an area of interest in the field of machine learning this is a relatively new field and is still evolving as its most basic level a machinelearning algorithm can be fed a data set and can be used to identify whether a hypothesis is true based on the dataset common machine learning algorithms can focus on identifying specific patterns in the data common patterns include regression classification or clustering but there are many possible patterns and algorithms that can be applied to data via machine learning by employing machine learning it is possible to find patterns or relationships in the data that would be difficult or impossible to find via manual inspection trial and error or traditional exploration techniques software trifacta a data preparation and analysis platformpaxata selfservice data preparation softwarealteryx data blending and advanced data analytics softwareibm infosphere analyzer a data profiling toolmicrosoft power bi interactive visualization and data analysis toolopenrefine a standalone open source desktop application for data cleanup and data transformationtableau software interactive data visualization software see also exploratory data analysismachine learningdata profilingdata visualization references
1,0,49082762,List of datasets for machine learning research,Machine learning,these datasets are used for machinelearning research and have been cited in peerreviewed academic journals and other publications datasets are an integral part of the field of machine learning major advances in this field can result from advances in learning algorithms such as deep learning computer hardware and lessintuitively the availability of highquality training datasets highquality labeled training datasets for supervised and semisupervised machine learning algorithms are usually difficult and expensive to produce because of the large amount of time needed to label the data although they do not need to be labeled highquality datasets for unsupervised learning can also be difficult and costly to produce this list aggregates highquality datasets that have been shown to be of value to the machine learning research community from multiple different data repositories to provide greater coverage of the topic than is otherwise available image data datasets consisting primarily of images or videos for tasks such as object detection facial recognition and multilabel classification facial recognition in computer vision face images have been used extensively to develop facial recognition systems face detection and many other projects that use images of faces action recognition object detection and recognition handwriting and character recognition aerial images other images text data datasets consisting primarily of text for tasks such as natural language processing sentiment analysis translation and cluster analysis reviews news articles messages twitter and tweets other text sound data datasets of sounds and sound features music other sounds signal data datasets containing electric signal information requiring some sort of signal processing for further analysis electrical motiontracking other signals physical data datasets from physical systems highenergy physics systems astronomy earth science other physical biological data datasets from biological systems human animal plant microbe drug discovery anomaly data multivariate data datasets consisting of rows of observations and columns of attributes characterizing those observations typically used for regression analysis or classification but other types of algorithms can also be used this section includes datasets that do not fit in the above categories financial weather census transit internet games other multivariate see also comparison of deep learning software references
2,0,233488,Machine learning,Machine learning,machine learning is a field of computer science that gives computers the ability to learn without being explicitly programmed arthur samuel an american pioneer in the field of computer gaming and artificial intelligence coined the term machine learning in NUMBER while at ibm evolved from the study of pattern recognition and computational learning theory in artificial intelligence machine learning explores the study and construction of algorithms that can learn from and make predictions on data such algorithms overcome following strictly static program instructions by making datadriven predictions or decisions through building a model from sample inputs machine learning is employed in a range of computing tasks where designing and programming explicit algorithms with good performance is difficult or infeasible example applications include email filtering detection of network intruders or malicious insiders working towards a data breach optical character recognition ocr learning to rank and computer vision machine learning is closely related to and often overlaps with computational statistics which also focuses on predictionmaking through the use of computers it has strong ties to mathematical optimization which delivers methods theory and application domains to the field machine learning is sometimes conflated with data mining where the latter subfield focuses more on exploratory data analysis and is known as unsupervised learning machine learning can also be unsupervised and be used to learn and establish baseline behavioral profiles for various entities and then used to find meaningful anomalies within the field of data analytics machine learning is a method used to devise complex models and algorithms that lend themselves to prediction in commercial use this is known as predictive analytics these analytical models allow researchers data scientists engineers and analysts to produce reliable repeatable decisions and results and uncover hidden insights through learning from historical relationships and trends in the data according to the gartner hype cycle of NUMBER machine learning is at its peak of inflated expectations effective machine learning is difficult because finding patterns is hard and often not enough training data is available as a result machinelearning programs often fail to deliver overview tom m mitchell provided a widely quoted more formal definition of the algorithms studied in the machine learning field a computer program is said to learn from experience e with respect to some class of tasks t and performance measure p if its performance at tasks in t as measured by p improves with experience e this definition of the tasks in which machine learning is concerned offers a fundamentally operational definition rather than defining the field in cognitive terms this follows alan turings proposal in his paper computing machinery and intelligence in which the question can machines think is replaced with the question can machines do what we as thinking entities can do in turings proposal the various characteristics that could be possessed by a thinking machine and the various implications in constructing one are exposed types of problems and tasks machine learning tasks are typically classified into two broad categories depending on whether there is a learning signal or feedback available to a learning systemsupervised learning the computer is presented with example inputs and their desired outputs given by a teacher and the goal is to learn a general rule that maps inputs to outputs as special cases the input signal can be only partially available or restricted to special feedbacksemisupervised learning the computer is given only an incomplete training signal a training set with some often many of the target outputs missing active learning the computer can only obtain training labels for a limited set of instances based on a budget and also has to optimize its choice of objects to acquire labels for when used interactively these can be presented to the user for labeling reinforcement learning training data in form of rewards and punishments is given only as feedback to the programs actions in a dynamic environment such as driving a vehicle or playing a game against an opponent unsupervised learning no labels are given to the learning algorithm leaving it on its own to find structure in its input unsupervised learning can be a goal in itself discovering hidden patterns in data or a means towards an end feature learning among other categories of machine learning problems learning to learn learns its own inductive bias based on previous experience developmental learning elaborated for robot learning generates its own sequences also called curriculum of learning situations to cumulatively acquire repertoires of novel skills through autonomous selfexploration and social interaction with human teachers and using guidance mechanisms such as active learning maturation motor synergies and imitation another categorization of machine learning tasks arises when one considers the desired output of a machinelearned systemin classification inputs are divided into two or more classes and the learner must produce a model that assigns unseen inputs to one or more multilabel classification of these classes this is typically tackled in a supervised way spam filtering is an example of classification where the inputs are email or other messages and the classes are spam and not spam in regression also a supervised problem the outputs are continuous rather than discrete in clustering a set of inputs is to be divided into groups unlike in classification the groups are not known beforehand making this typically an unsupervised task density estimation finds the distribution of inputs in some space dimensionality reduction simplifies inputs by mapping them into a lowerdimensional space topic modeling is a related problem where a program is given a list of human language documents and is tasked to find out which documents cover similar topics history and relationships to other fields as a scientific endeavour machine learning grew out of the quest for artificial intelligence already in the early days of ai as an academic discipline some researchers were interested in having machines learn from data they attempted to approach the problem with various symbolic methods as well as what were then termed neural networks these were mostly perceptrons and other models that were later found to be reinventions of the generalized linear models of statistics probabilistic reasoning was also employed especially in automated medical diagnosis however an increasing emphasis on the logical knowledgebased approach caused a rift between ai and machine learning probabilistic systems were plagued by theoretical and practical problems of data acquisition and representation by NUMBER expert systems had come to dominate ai and statistics was out of favor work on symbolicknowledgebased learning did continue within ai leading to inductive logic programming but the more statistical line of research was now outside the field of ai proper in pattern recognition and information retrieval neural networks research had been abandoned by ai and computer science around the same time this line too was continued outside the aics field as connectionism by researchers from other disciplines including hopfield rumelhart and hinton their main success came in the midNUMBER s with the reinvention of backpropagation machine learning reorganized as a separate field started to flourish in the NUMBER s the field changed its goal from achieving artificial intelligence to tackling solvable problems of a practical nature it shifted focus away from the symbolic approaches it had inherited from ai and toward methods and models borrowed from statistics and probability theory it also benefited from the increasing availability of digitized information and the possibility to distribute that via the internet machine learning and data mining often employ the same methods and overlap significantly but while machine learning focuses on prediction based on known properties learned from the training data data mining focuses on the discovery of previously unknown properties in the data this is the analysis step of knowledge discovery in databases data mining uses many machine learning methods but with different goals on the other hand machine learning also employs data mining methods as unsupervised learning or as a preprocessing step to improve learner accuracy much of the confusion between these two research communities which do often have separate conferences and separate journals ecml pkdd being a major exception comes from the basic assumptions they work with in machine learning performance is usually evaluated with respect to the ability to reproduce known knowledge while in knowledge discovery and data mining kdd the key task is the discovery of previously unknown knowledge evaluated with respect to known knowledge an uninformed unsupervised method will easily be outperformed by other supervised methods while in a typical kdd task supervised methods cannot be used due to the unavailability of training data machine learning also has intimate ties to optimization many learning problems are formulated as minimization of some loss function on a training set of examples loss functions express the discrepancy between the predictions of the model being trained and the actual problem instances for example in classification one wants to assign a label to instances and models are trained to correctly predict the preassigned labels of a set of examples the difference between the two fields arises from the goal of generalization while optimization algorithms can minimize the loss on a training set machine learning is concerned with minimizing the loss on unseen samples relation to statistics machine learning and statistics are closely related fields according to michael i jordan the ideas of machine learning from methodological principles to theoretical tools have had a long prehistory in statistics he also suggested the term data science as a placeholder to call the overall field leo breiman distinguished two statistical modelling paradigms data model and algorithmic model wherein algorithmic model means more or less the machine learning algorithms like random forest some statisticians have adopted methods from machine learning leading to a combined field that they call statistical learning theory a core objective of a learner is to generalize from its experience generalization in this context is the ability of a learning machine to perform accurately on new unseen examplestasks after having experienced a learning data set the training examples come from some generally unknown probability distribution considered representative of the space of occurrences and the learner has to build a general model about this space that enables it to produce sufficiently accurate predictions in new cases the computational analysis of machine learning algorithms and their performance is a branch of theoretical computer science known as computational learning theory because training sets are finite and the future is uncertain learning theory usually does not yield guarantees of the performance of algorithms instead probabilistic bounds on the performance are quite common the biasvariance decomposition is one way to quantify generalization error for the best performance in the context of generalization the complexity of the hypothesis should match the complexity of the function underlying the data if the hypothesis is less complex than the function then the model has underfit the data if the complexity of the model is increased in response then the training error decreases but if the hypothesis is too complex then the model is subject to overfitting and generalization will be poorer in addition to performance bounds computational learning theorists study the time complexity and feasibility of learning in computational learning theory a computation is considered feasible if it can be done in polynomial time there are two kinds of time complexity results positive results show that a certain class of functions can be learned in polynomial time negative results show that certain classes cannot be learned in polynomial time approaches decision tree learning decision tree learning uses a decision tree as a predictive model which maps observations about an item to conclusions about the items target value association rule learning association rule learning is a method for discovering interesting relations between variables in large databases artificial neural networks an artificial neural network ann learning algorithm usually called neural network nn is a learning algorithm that is inspired by the structure and functional aspects of biological neural networks computations are structured in terms of an interconnected group of artificial neurons processing information using a connectionist approach to computation modern neural networks are nonlinear statistical data modeling tools they are usually used to model complex relationships between inputs and outputs to find patterns in data or to capture the statistical structure in an unknown joint probability distribution between observed variables deep learning falling hardware prices and the development of gpus for personal use in the last few years have contributed to the development of the concept of deep learning which consists of multiple hidden layers in an artificial neural network this approach tries to model the way the human brain processes light and sound into vision and hearing some successful applications of deep learning are computer vision and speech recognition inductive logic programming inductive logic programming ilp is an approach to rule learning using logic programming as a uniform representation for input examples background knowledge and hypotheses given an encoding of the known background knowledge and a set of examples represented as a logical database of facts an ilp system will derive a hypothesized logic program that entails all positive and no negative examples inductive programming is a related field that considers any kind of programming languages for representing hypotheses and not only logic programming such as functional programs support vector machines support vector machines svms are a set of related supervised learning methods used for classification and regression given a set of training examples each marked as belonging to one of two categories an svm training algorithm builds a model that predicts whether a new example falls into one category or the other clustering cluster analysis is the assignment of a set of observations into subsets called clusters so that observations within the same cluster are similar according to some predesignated criterion or criteria while observations drawn from different clusters are dissimilar different clustering techniques make different assumptions on the structure of the data often defined by some similarity metric and evaluated for example by internal compactness similarity between members of the same cluster and separation between different clusters other methods are based on estimated density and graph connectivity clustering is a method of unsupervised learning and a common technique for statistical data analysis bayesian networks a bayesian network belief network or directed acyclic graphical model is a probabilistic graphical model that represents a set of random variables and their conditional independencies via a directed acyclic graph dag for example a bayesian network could represent the probabilistic relationships between diseases and symptoms given symptoms the network can be used to compute the probabilities of the presence of various diseases efficient algorithms exist that perform inference and learning reinforcement learning reinforcement learning is concerned with how an agent ought to take actions in an environment so as to maximize some notion of longterm reward reinforcement learning algorithms attempt to find a policy that maps states of the world to the actions the agent ought to take in those states reinforcement learning differs from the supervised learning problem in that correct inputoutput pairs are never presented nor suboptimal actions explicitly corrected representation learning several learning algorithms mostly unsupervised learning algorithms aim at discovering better representations of the inputs provided during training classical examples include principal components analysis and cluster analysis representation learning algorithms often attempt to preserve the information in their input but transform it in a way that makes it useful often as a preprocessing step before performing classification or predictions allowing reconstruction of the inputs coming from the unknown data generating distribution while not being necessarily faithful for configurations that are implausible under that distribution manifold learning algorithms attempt to do so under the constraint that the learned representation is lowdimensional sparse coding algorithms attempt to do so under the constraint that the learned representation is sparse has many zeros multilinear subspace learning algorithms aim to learn lowdimensional representations directly from tensor representations for multidimensional data without reshaping them into highdimensional vectors deep learning algorithms discover multiple levels of representation or a hierarchy of features with higherlevel more abstract features defined in terms of or generating lowerlevel features it has been argued that an intelligent machine is one that learns a representation that disentangles the underlying factors of variation that explain the observed data similarity and metric learning in this problem the learning machine is given pairs of examples that are considered similar and pairs of less similar objects it then needs to learn a similarity function or a distance metric function that can predict if new objects are similar it is sometimes used in recommendation systems sparse dictionary learning in this method a datum is represented as a linear combination of basis functions and the coefficients are assumed to be sparse let x be a ddimensional datum d be a d by n matrix where each column of d represents a basis function r is the coefficient to represent x using d mathematically sparse dictionary learning means solving x d r displaystyle xapprox dr where r is sparse generally speaking n is assumed to be larger than d to allow the freedom for a sparse representation learning a dictionary along with sparse representations is strongly nphard and also difficult to solve approximately a popular heuristic method for sparse dictionary learning is ksvd sparse dictionary learning has been applied in several contexts in classification the problem is to determine which classes a previously unseen datum belongs to suppose a dictionary for each class has already been built then a new datum is associated with the class such that its best sparsely represented by the corresponding dictionary sparse dictionary learning has also been applied in image denoising the key idea is that a clean image patch can be sparsely represented by an image dictionary but the noise cannot genetic algorithms a genetic algorithm ga is a search heuristic that mimics the process of natural selection and uses methods such as mutation and crossover to generate new genotype in the hope of finding good solutions to a given problem in machine learning genetic algorithms found some uses in the NUMBER s and NUMBER s conversely machine learning techniques have been used to improve the performance of genetic and evolutionary algorithms rulebased machine learning rulebased machine learning is a general term for any machine learning method that identifies learns or evolves rules to store manipulate or apply knowledge the defining characteristic of a rulebased machine learner is the identification and utilization of a set of relational rules that collectively represent the knowledge captured by the system this is in contrast to other machine learners that commonly identify a singular model that can be universally applied to any instance in order to make a prediction rulebased machine learning approaches include learning classifier systems association rule learning and artificial immune systems learning classifier systems learning classifier systems lcs are a family of rulebased machine learning algorithms that combine a discovery component e g typically a genetic algorithm with a learning component performing either supervised learning reinforcement learning or unsupervised learning they seek to identify a set of contextdependent rules that collectively store and apply knowledge in a piecewise manner in order to make predictions applications applications for machine learning includein NUMBER the online movie company netflix held the first netflix prize competition to find a program to better predict user preferences and improve the accuracy on its existing cinematch movie recommendation algorithm by at least NUMBER a joint team made up of researchers from att labsresearch in collaboration with the teams big chaos and pragmatic theory built an ensemble model to win the grand prize in NUMBER for NUMBER million shortly after the prize was awarded netflix realized that viewers ratings were not the best indicators of their viewing patterns everything is a recommendation and they changed their recommendation engine accordingly in NUMBER the wall street journal wrote about the firm rebellion research and their use of machine learning to predict the financial crisis in NUMBER cofounder of sun microsystems vinod khosla predicted that NUMBER of medical doctors jobs would be lost in the next two decades to automated machine learning medical diagnostic software in NUMBER it has been reported that a machine learning algorithm has been applied in art history to study fine art paintings and that it may have revealed previously unrecognized influences between artists model assessments classification machine learning models can be validated by accuracy estimation techniques like the holdout method which splits the data in a training and test set conventionally NUMBER training set and NUMBER test set designation and evaluates the performance of the training model on the test set in comparison the nfoldcrossvalidation method randomly splits the data in k subsets where the kNUMBER instances of the data are used to train the model while the kth instance is used to test the predictive ability of the training model in addition to the holdout and crossvalidation methods bootstrap which samples n instances with replacement from the dataset can be used to assess model accuracy in addition to overall accuracy investigators frequently report sensitivity and specificity meaning true positive rate tpr and true negative rate tnr respectively similarly investigators sometimes report the false positive rate fpr as well as the false negative rate fnr however these rates are ratios that fail to reveal their numerators and denominators the total operating characteristic toc is an effective method to express a models diagnostic ability toc shows the numerators and denominators of the previously mentioned rates thus toc provides more information than the commonly used receiver operating characteristic roc and rocs associated area under the curve auc ethics machine learning poses a host of ethical questions systems which are trained on datasets collected with biases may exhibit these biases upon use algorithmic bias thus digitizing cultural prejudices for example using job hiring data from a firm with racist hiring policies may lead to a machine learning system duplicating the bias by scoring job applicants against similarity to previous successful applicants responsible collection of data and documentation of algorithmic rules used by a system thus is a critical part of machine learning because language contains biases machines trained on language corpora will necessarily also learn bias software software suites containing a variety of machine learning algorithms include the following free and opensource software proprietary software with free and opensource editions proprietary software journals journal of machine learning researchmachine learningneural computation conferences conference on neural information processing systemsinternational conference on machine learninginternational conference on learning representations see also references further reading external links international machine learning societypopular online course by andrew ng at coursera it uses gnu octave the course is a free version of stanford universitys actual course taught by ng whose lectures are also available for free mloss is an academic database of opensource machine learning software


In [680]:
dummy.shape

(1075, 5)

In [830]:
dummy_2.head(3)

Unnamed: 0,ns,pageid,title,category,text
0,0,1037763,Business software,Business software,business software or a business application is any software or set of computer programs used by business users to perform various business functions these business applications are used to increase productivity to measure productivity and to perform other business functions accurately by and large business software is likely to be developed to meet the needs of a specific business and therefore is not easily transferable to a different business environment unless its nature and operation is identical due to the unique requirements of each business offtheshelf software is unlikely to completely address a companys needs however where an ontheshelf solution is necessary due to time or monetary considerations some level of customization is likely to be required exceptions do exist depending on the business in question and thorough research is always required before committing to bespoke or offtheshelf solutions some business applications are interactive i e they have a graphical user interface or user interface and users can querymodifyinput data and view results instantaneously they can also run reports instantaneously some business applications run in batch mode they are set up to run based on a predetermined eventtime and a business user does not need to initiate them or monitor them some business applications are built inhouse and some are bought from vendors off the shelf software products these business applications are installed on either desktops or big servers prior to the introduction of cobol a universal compiler in NUMBER businesses developed their own unique machine language rcas language consisted of a NUMBER position instruction for example to read a record into memory the first two digits would be the instruction action code the next four positions of the instruction an a address would be the exact leftmost memory location where you want the readable character to be placed four positions a b address of the instruction would note the very rightmost memory location where you want the last character of the record to be located a two digit b address also allows a modification of any instruction instruction codes and memory designations excluded the use of NUMBER s or NUMBER s the first rca business application was implemented in NUMBER on a NUMBER k rca NUMBER the rca NUMBER mid frame NUMBER and large frame NUMBER began their marketing in early NUMBER many kinds of users are found within the business environment and can be categorized by using a small medium and large matrixthe small business market generally consists of home accounting software and office suites such as openoffice org or microsoft office the medium size or small and mediumsized enterprise sme has a broader range of software applications ranging from accounting groupware customer relationship management human resource management systems outsourcing relationship management loan origination software shopping cart software field service software and other productivity enhancing applications the last segment covers enterprise level software applications such as those in the fields of enterprise resource planning enterprise content management ecm business process management bpm and product lifecycle management these applications are extensive in scope and often come with modules that either add native functions or incorporate the functionality of thirdparty computer programs technologies that previously only existed in peertopeer software applications like kazaa and napster are starting to appear within business applications types of business tools enterprise application software easresource managementdigital dashboards also known as business intelligence dashboards enterprise dashboards or executive dashboards these are visually based summaries of business data that show ataglance understanding of conditions through metrics and key performance indicators kpis dashboards are a very popular tools that have arisen in the last few years online analytical processing olap which include holap rolap and molap are a capability of some management decision support and executive information systems that support interactive examination of large amounts of data from many perspectives reporting software generates aggregated views of data to keep the management informed about the state of their business procurement software is business software that helps to automate the purchasing function of organizations data mining is the extraction of consumer information from a database by utilizing software that can isolate and identify previously unknown patterns or trends in large amounts of data there is a variety of data mining techniques that reveal different types of patterns some of the techniques that belong here are statistical methods particularly business statistics and neural networks as very advanced means of analyzing data business performance management bpmdocument management software is made for organizing and managing multiple documents of various types some of them have storage functions for security and backup of valuable business information employee scheduling software used for creating and distributing employee schedules as well as for tracking employee hours brief history the essential motivation for business software is to increase profits by cutting costs or speeding the productive cycle in the earliest days of whitecollar business automation large mainframe computers were used to tackle the most tedious jobs like bank cheque clearing and factory accounting factory accounting software was among the most popular of early business software tools and included the automation of general ledgers fixed assets inventory ledgers cost accounting ledgers accounts receivable ledgers and accounts payable ledgers including payroll life insurance health insurance federal and state insurance and retirement the early use of software to replace manual whitecollar labor was extremely profitable and caused a radical shift in whitecollar labor one computer might easily replace NUMBER whitecollar pencil pushers and the computer would not require any health or retirement benefits building on these early successes with ibm hewlettpackard and other early suppliers of business software solutions corporate consumers demanded business software to replace the oldfashioned drafting board cadcam software or computeraided drafting for computeraided manufacturing arrived in the early NUMBER s also project management software was so valued in the early NUMBER s that it might cost as much as NUMBER per copy although such software typically had far fewer capabilities than modern project management software such as microsoft project which one might purchase today for under NUMBER per copy in the early days perhaps the most noticeable widespread change in business software was the word processor because of its rapid rise the ubiquitous ibm typewriter suddenly vanished in the NUMBER s as millions of companies worldwide shifted to the use of word perfect business software and later microsoft word software another vastly popular computer program for business were mathematical spreadsheet programs such as lotus NUMBER and later microsoft excel in the NUMBER s business shifted massively towards globalism with the appearance of sap software which coordinates a supplychain of vendors potentially worldwide for the most efficient streamlined operation of factory manufacture yet nothing in the history of business software has had the global impact of the internet with its email and websites that now serve commercial interests worldwide globalism in business fully arrived when the internet became a household word the next phase in the evolution of business software is being led by the emergance of robotic process automation rpa which involves identifying and automating highly repetitive tasks and processes with an aim to drive operational efficiency reduce costs and limit human error industries that have been in the forefront of rpa adoption include the insurance industry banking and financial services the legal industry and the healthcare industry application support business applications are built based on the requirements from the business users also these business applications are built to use certain kind of business transactions or data items these business applications run flawlessly until there are no new business requirements or there is no change in underlying business transactions also the business applications run flawlessly if there are no issues with computer hardware computer networks intenetintranet computer disks power supplies and various software components middleware database computer programs etc business applications can fail when an unexpected error occurs this error could occur due to a data error an unexpected data input or a wrong data input an environment error an in frastructure related error a programming error a human error or a work flow error when a business application fails one needs to fix the business application error as soon as possible so that the business users can resume their work this work of resolving business application errors is known as business application support reporting errors the business user calls the business application support team phone number or sends an email to the business application support team the business application support team gets all the details of the error from the business user on the phone or from the email these details are then entered in a tracking software the tracking software creates a request number and this request number is given to the business user this request number is used to track the progress on the support issue the request is assigned to a support team member notification of errors for critical business application errors such as an application not available or an application not working correctly an email is sent to the entire organization or impacted teams so that they are aware of the issue they are also provided with an estimated time for application availability investigation or analysis of application errors the business application support team member collects all the necessary information about the business software error this information is then recorded in the support request all of the data used by the business user is also used in the investigation the application program is reviewed for any possible programming errors error resolution if any similar business application errors occurred in the past then the issue resolution steps are retrieved from the support knowledge base and the error is resolved using those steps if it is a new support error then new issue resolution steps are created and the error is resolved the new support error resolution steps are recorded in the knowledge base for future use for major business application errors critical infrastructure or application failures a phone conference call is initiated and all required support personsteams join the call and they all work together to resolve the error code correction if the business application error occurred due to programming errors then a request is created for the application development team to correct programming errors if the business user needs new features or functions in the business application then the required analysisdesignprogrammingtestingrelease is planned and a new version of the business software is deployed business process correction if the business application error occurred due to a work flow issue or human errors during data input then the business users are notified business users then review their work flow and revise it if necessary they also modify the user guide or user instructions to avoid such an error in the future infrastructure issue correction if the business application error occurred due to infrastructure issues then the specific infrastructure team is notified the infrastructure team then implements permanent fixes for the issue and monitors the infrastructure to avoid the reoccurrence of the same error support follow up and internal reporting the business application error tracking system is used to review all issues periodically daily weekly and monthly and reports are generated to monitor the resolved issues repeating issues and pending issues reports are also generated for the itis management for improvement and management of business applications see also references external links
1,0,41270069,AccuSystems,Business software,accusystems llc is an american company headquartered in pueblo colorado that develops licenses supports and sells document imaging software and electronic document management primarily to the banking and finance industries over NUMBER banks currently use accusystems software in NUMBER accusystems made its first sale to peoples bank accusystems software known as accuaccount is mainly used to electronically scan store and manage loan files and any other associated paperwork in september NUMBER accusystems announced the acquisition of xtria rms financial institutions use the xtria rms software now known as tickler to track exceptions electronically in NUMBER accusystems won the banknews innovative solutions award for management software solutions in NUMBER banktech published a survey taken by accusystems that surveyed NUMBER community banks the survey found that only NUMBER of banks are using paperless systems for issuing loans the study is often used when discussing paperless systems in the uss banking industry references external links official website
2,0,5211212,Active policy management,Business software,active policy management is businessoriented enterprise software that provides an approach for efficiently and effectively addressing the many risks inherent in electronic communication with the exponential growth in the use of electronic communication many businesses are exposed to significant risks every day these risks range from noncompliance with various regulations to the leakage of intellectual property and to inappropriate or offensive employee behavior active policy management enables a business to accurately detect the violations to take the appropriate action even blocking the message from being sent and to quickly find and review the violation in order to address the situation preventing further damage there are many channels of electronic communication including email webbased email instant messaging messages sent from a bloomberg terminal mobile email sent from a handheld device such as a blackberry general use of a web browser ftp file copying e g memory sticks and many others electronic communication policy the key to effective detection of violations in electronic communication is policy policy for electronic communication defines who can send what to whom and if a violation is detected what action to take a policy is designed to address a specific issue or risk examples includecertain reports cannot be sent externally without a proper disclaimer being presentcertain employees cannot communicate about a business matter with other employeesdocuments intended for internal use only must not be sent to a recipient who is not a company employeepolicy can only be effective at identifying violations if it can understand the true intent of a message policies based only on a list of words or a lexicon generally cannot perform this task for any apm solution to be effective it must have a proven technology to define and deploy accurate policy and by proven an interested party should inquire as to a particular solutions successful installation at one or more customers application areas apm has three primary application areas realtime prevention intelligent review and smart tagging realtime prevention can detect violations in electronic communication before a message has been sent and before it has been delivered to an intended recipient by doing this a violation is prevented from having occurred and in the case where archive software is used a message that has not been sent will not be ingested by an archive or be retrievable at a later date intelligent review can detect violations in electronic communication after a message has been sent intelligent review also creates extremely targeted queues of messages that have a high likelihood of having violated an important corporate or regulatory policy a reviewer or supervisor can easily access these relevant messages in order to thoroughly audit them an audit can include flagging exporting approving rejecting and escalating a message smart tagging analyzes messages and assigns them to one or more categories this categorization can be used for selective message archiving to retain messages based on their content and to enhance message retrieval for investigative purposes industry relevance virtually all businesses use electronic communication and are exposed to the inherent risks therein certain businesses are exposed to more risks than others heavily regulated industries such as financial services have a very strong need for apm industries where companies have many of their intellectual property assets in digital form would benefit from protecting those assets with apm other industries that would benefit from using apm include those where companies are concerned with corporate behavior and governance and those that use archive software to store messages for long periods of time often for at least NUMBER years see also enterprise softwareregulatory complianceemailsecfinancial industry regulatory authoritynyseintellectual propertyarchive


In [831]:
dummy_2.shape

(1918, 5)

In [832]:
ml_df = dummy.copy()

In [833]:
bs_df = dummy_2.copy()

In [644]:
# def cleaner(whatever):
#     """
#     used in page_hunter function to clean urls
#     """
#     whatever = re.sub('\s', '_', whatever)
#     whatever = re.sub('\+', '%2B', whatever)
#     whatever = re.sub('\&', '%26', whatever)
#     return whatever

In [645]:
# def page_hunter(page):
#     url = """
#     http://en.wikipedia.org/w/api.php?action=query&\
#               format=json&prop=extracts&titles={}&rvprop=content""".format(cleaner(page))
#     url = re.sub('\s', '', url)
#     r = requests.get(url)
#     key = list(r.json()['query']['pages'].keys())[0]
#     return r.json()['query']['pages'][key]['extract']
    

In [646]:
# def get_pages(scraped_df):
#     """
#     maps titles to newly created text column in df
#     """
    
#     scraped_df['text'] = scraped_df['title'].map(lambda x: page_hunter(x))
    
#     return scraped_df
    
    
# #     for i, page in enumerate(scraped_df.title):
# #         page_to_df(page)
    
    

In [647]:
# ml_df_all = get_pages(ml_df)

In [692]:
ml_df_no_ns = dummy.drop(['ns'], axis=1)

In [834]:
bs_df_no_ns = dummy_2.drop(['ns'], axis=1)

In [693]:
ml_df_no_ns.head(4)

Unnamed: 0,pageid,title,category,text
0,43385931,Data exploration,Machine learning,data exploration is an approach similar to initial data analysis whereby a data analyst uses visual exploration to understand what is in a dataset and the characteristics of the data rather than through traditional data management systems these characteristics can include size or amount of data completeness of the data correctness of the data possible relationships amongst data elements or filestables in the data data exploration is typically conducted using a combination of automated and manual activities automated activities can include data profiling or data visualization or tabular reports to give the analyst an initial view into the data and an understanding of key characteristics this is often followed by manual drilldown or filtering of the data to identify anomalies or patterns identified through the automated actions data exploration can also require manual scripting and queries into the data e g using languages such as sql or r or using excel or similar tools to view the raw data all of these activities are aimed at creating a clear mental model and understanding of the data in the mind of the analyst and defining basic metadata statistics structure relationships for the data set that can be used in further analysis once this initial understanding of the data is had the data can be pruned or refined by removing unusable parts of the data correcting poorly formatted elements and defining relevant relationships across datasets this process is also known as determining data quality at this stage the data can be considered ready for deeper analysis or be handed off to other analysts or users who have specific needs for the data data exploration can also refer to the adhoc querying and visualization of data to identify potential relationships or insights that may be hidden in the data in this scenario hypotheses may be created and then the data is explored to identify whether those hypotheses are correct traditionally this had been a key area of focus for statisticians with john tukey being a key evangelist in the field today data exploration is more widespread and is the focus of data analysts and data scientists the latter being a relatively new role within enterprises and larger organizations interactive data exploration this area of data exploration has become an area of interest in the field of machine learning this is a relatively new field and is still evolving as its most basic level a machinelearning algorithm can be fed a data set and can be used to identify whether a hypothesis is true based on the dataset common machine learning algorithms can focus on identifying specific patterns in the data common patterns include regression classification or clustering but there are many possible patterns and algorithms that can be applied to data via machine learning by employing machine learning it is possible to find patterns or relationships in the data that would be difficult or impossible to find via manual inspection trial and error or traditional exploration techniques software trifacta a data preparation and analysis platformpaxata selfservice data preparation softwarealteryx data blending and advanced data analytics softwareibm infosphere analyzer a data profiling toolmicrosoft power bi interactive visualization and data analysis toolopenrefine a standalone open source desktop application for data cleanup and data transformationtableau software interactive data visualization software see also exploratory data analysismachine learningdata profilingdata visualization references
1,49082762,List of datasets for machine learning research,Machine learning,these datasets are used for machinelearning research and have been cited in peerreviewed academic journals and other publications datasets are an integral part of the field of machine learning major advances in this field can result from advances in learning algorithms such as deep learning computer hardware and lessintuitively the availability of highquality training datasets highquality labeled training datasets for supervised and semisupervised machine learning algorithms are usually difficult and expensive to produce because of the large amount of time needed to label the data although they do not need to be labeled highquality datasets for unsupervised learning can also be difficult and costly to produce this list aggregates highquality datasets that have been shown to be of value to the machine learning research community from multiple different data repositories to provide greater coverage of the topic than is otherwise available image data datasets consisting primarily of images or videos for tasks such as object detection facial recognition and multilabel classification facial recognition in computer vision face images have been used extensively to develop facial recognition systems face detection and many other projects that use images of faces action recognition object detection and recognition handwriting and character recognition aerial images other images text data datasets consisting primarily of text for tasks such as natural language processing sentiment analysis translation and cluster analysis reviews news articles messages twitter and tweets other text sound data datasets of sounds and sound features music other sounds signal data datasets containing electric signal information requiring some sort of signal processing for further analysis electrical motiontracking other signals physical data datasets from physical systems highenergy physics systems astronomy earth science other physical biological data datasets from biological systems human animal plant microbe drug discovery anomaly data multivariate data datasets consisting of rows of observations and columns of attributes characterizing those observations typically used for regression analysis or classification but other types of algorithms can also be used this section includes datasets that do not fit in the above categories financial weather census transit internet games other multivariate see also comparison of deep learning software references
2,233488,Machine learning,Machine learning,machine learning is a field of computer science that gives computers the ability to learn without being explicitly programmed arthur samuel an american pioneer in the field of computer gaming and artificial intelligence coined the term machine learning in NUMBER while at ibm evolved from the study of pattern recognition and computational learning theory in artificial intelligence machine learning explores the study and construction of algorithms that can learn from and make predictions on data such algorithms overcome following strictly static program instructions by making datadriven predictions or decisions through building a model from sample inputs machine learning is employed in a range of computing tasks where designing and programming explicit algorithms with good performance is difficult or infeasible example applications include email filtering detection of network intruders or malicious insiders working towards a data breach optical character recognition ocr learning to rank and computer vision machine learning is closely related to and often overlaps with computational statistics which also focuses on predictionmaking through the use of computers it has strong ties to mathematical optimization which delivers methods theory and application domains to the field machine learning is sometimes conflated with data mining where the latter subfield focuses more on exploratory data analysis and is known as unsupervised learning machine learning can also be unsupervised and be used to learn and establish baseline behavioral profiles for various entities and then used to find meaningful anomalies within the field of data analytics machine learning is a method used to devise complex models and algorithms that lend themselves to prediction in commercial use this is known as predictive analytics these analytical models allow researchers data scientists engineers and analysts to produce reliable repeatable decisions and results and uncover hidden insights through learning from historical relationships and trends in the data according to the gartner hype cycle of NUMBER machine learning is at its peak of inflated expectations effective machine learning is difficult because finding patterns is hard and often not enough training data is available as a result machinelearning programs often fail to deliver overview tom m mitchell provided a widely quoted more formal definition of the algorithms studied in the machine learning field a computer program is said to learn from experience e with respect to some class of tasks t and performance measure p if its performance at tasks in t as measured by p improves with experience e this definition of the tasks in which machine learning is concerned offers a fundamentally operational definition rather than defining the field in cognitive terms this follows alan turings proposal in his paper computing machinery and intelligence in which the question can machines think is replaced with the question can machines do what we as thinking entities can do in turings proposal the various characteristics that could be possessed by a thinking machine and the various implications in constructing one are exposed types of problems and tasks machine learning tasks are typically classified into two broad categories depending on whether there is a learning signal or feedback available to a learning systemsupervised learning the computer is presented with example inputs and their desired outputs given by a teacher and the goal is to learn a general rule that maps inputs to outputs as special cases the input signal can be only partially available or restricted to special feedbacksemisupervised learning the computer is given only an incomplete training signal a training set with some often many of the target outputs missing active learning the computer can only obtain training labels for a limited set of instances based on a budget and also has to optimize its choice of objects to acquire labels for when used interactively these can be presented to the user for labeling reinforcement learning training data in form of rewards and punishments is given only as feedback to the programs actions in a dynamic environment such as driving a vehicle or playing a game against an opponent unsupervised learning no labels are given to the learning algorithm leaving it on its own to find structure in its input unsupervised learning can be a goal in itself discovering hidden patterns in data or a means towards an end feature learning among other categories of machine learning problems learning to learn learns its own inductive bias based on previous experience developmental learning elaborated for robot learning generates its own sequences also called curriculum of learning situations to cumulatively acquire repertoires of novel skills through autonomous selfexploration and social interaction with human teachers and using guidance mechanisms such as active learning maturation motor synergies and imitation another categorization of machine learning tasks arises when one considers the desired output of a machinelearned systemin classification inputs are divided into two or more classes and the learner must produce a model that assigns unseen inputs to one or more multilabel classification of these classes this is typically tackled in a supervised way spam filtering is an example of classification where the inputs are email or other messages and the classes are spam and not spam in regression also a supervised problem the outputs are continuous rather than discrete in clustering a set of inputs is to be divided into groups unlike in classification the groups are not known beforehand making this typically an unsupervised task density estimation finds the distribution of inputs in some space dimensionality reduction simplifies inputs by mapping them into a lowerdimensional space topic modeling is a related problem where a program is given a list of human language documents and is tasked to find out which documents cover similar topics history and relationships to other fields as a scientific endeavour machine learning grew out of the quest for artificial intelligence already in the early days of ai as an academic discipline some researchers were interested in having machines learn from data they attempted to approach the problem with various symbolic methods as well as what were then termed neural networks these were mostly perceptrons and other models that were later found to be reinventions of the generalized linear models of statistics probabilistic reasoning was also employed especially in automated medical diagnosis however an increasing emphasis on the logical knowledgebased approach caused a rift between ai and machine learning probabilistic systems were plagued by theoretical and practical problems of data acquisition and representation by NUMBER expert systems had come to dominate ai and statistics was out of favor work on symbolicknowledgebased learning did continue within ai leading to inductive logic programming but the more statistical line of research was now outside the field of ai proper in pattern recognition and information retrieval neural networks research had been abandoned by ai and computer science around the same time this line too was continued outside the aics field as connectionism by researchers from other disciplines including hopfield rumelhart and hinton their main success came in the midNUMBER s with the reinvention of backpropagation machine learning reorganized as a separate field started to flourish in the NUMBER s the field changed its goal from achieving artificial intelligence to tackling solvable problems of a practical nature it shifted focus away from the symbolic approaches it had inherited from ai and toward methods and models borrowed from statistics and probability theory it also benefited from the increasing availability of digitized information and the possibility to distribute that via the internet machine learning and data mining often employ the same methods and overlap significantly but while machine learning focuses on prediction based on known properties learned from the training data data mining focuses on the discovery of previously unknown properties in the data this is the analysis step of knowledge discovery in databases data mining uses many machine learning methods but with different goals on the other hand machine learning also employs data mining methods as unsupervised learning or as a preprocessing step to improve learner accuracy much of the confusion between these two research communities which do often have separate conferences and separate journals ecml pkdd being a major exception comes from the basic assumptions they work with in machine learning performance is usually evaluated with respect to the ability to reproduce known knowledge while in knowledge discovery and data mining kdd the key task is the discovery of previously unknown knowledge evaluated with respect to known knowledge an uninformed unsupervised method will easily be outperformed by other supervised methods while in a typical kdd task supervised methods cannot be used due to the unavailability of training data machine learning also has intimate ties to optimization many learning problems are formulated as minimization of some loss function on a training set of examples loss functions express the discrepancy between the predictions of the model being trained and the actual problem instances for example in classification one wants to assign a label to instances and models are trained to correctly predict the preassigned labels of a set of examples the difference between the two fields arises from the goal of generalization while optimization algorithms can minimize the loss on a training set machine learning is concerned with minimizing the loss on unseen samples relation to statistics machine learning and statistics are closely related fields according to michael i jordan the ideas of machine learning from methodological principles to theoretical tools have had a long prehistory in statistics he also suggested the term data science as a placeholder to call the overall field leo breiman distinguished two statistical modelling paradigms data model and algorithmic model wherein algorithmic model means more or less the machine learning algorithms like random forest some statisticians have adopted methods from machine learning leading to a combined field that they call statistical learning theory a core objective of a learner is to generalize from its experience generalization in this context is the ability of a learning machine to perform accurately on new unseen examplestasks after having experienced a learning data set the training examples come from some generally unknown probability distribution considered representative of the space of occurrences and the learner has to build a general model about this space that enables it to produce sufficiently accurate predictions in new cases the computational analysis of machine learning algorithms and their performance is a branch of theoretical computer science known as computational learning theory because training sets are finite and the future is uncertain learning theory usually does not yield guarantees of the performance of algorithms instead probabilistic bounds on the performance are quite common the biasvariance decomposition is one way to quantify generalization error for the best performance in the context of generalization the complexity of the hypothesis should match the complexity of the function underlying the data if the hypothesis is less complex than the function then the model has underfit the data if the complexity of the model is increased in response then the training error decreases but if the hypothesis is too complex then the model is subject to overfitting and generalization will be poorer in addition to performance bounds computational learning theorists study the time complexity and feasibility of learning in computational learning theory a computation is considered feasible if it can be done in polynomial time there are two kinds of time complexity results positive results show that a certain class of functions can be learned in polynomial time negative results show that certain classes cannot be learned in polynomial time approaches decision tree learning decision tree learning uses a decision tree as a predictive model which maps observations about an item to conclusions about the items target value association rule learning association rule learning is a method for discovering interesting relations between variables in large databases artificial neural networks an artificial neural network ann learning algorithm usually called neural network nn is a learning algorithm that is inspired by the structure and functional aspects of biological neural networks computations are structured in terms of an interconnected group of artificial neurons processing information using a connectionist approach to computation modern neural networks are nonlinear statistical data modeling tools they are usually used to model complex relationships between inputs and outputs to find patterns in data or to capture the statistical structure in an unknown joint probability distribution between observed variables deep learning falling hardware prices and the development of gpus for personal use in the last few years have contributed to the development of the concept of deep learning which consists of multiple hidden layers in an artificial neural network this approach tries to model the way the human brain processes light and sound into vision and hearing some successful applications of deep learning are computer vision and speech recognition inductive logic programming inductive logic programming ilp is an approach to rule learning using logic programming as a uniform representation for input examples background knowledge and hypotheses given an encoding of the known background knowledge and a set of examples represented as a logical database of facts an ilp system will derive a hypothesized logic program that entails all positive and no negative examples inductive programming is a related field that considers any kind of programming languages for representing hypotheses and not only logic programming such as functional programs support vector machines support vector machines svms are a set of related supervised learning methods used for classification and regression given a set of training examples each marked as belonging to one of two categories an svm training algorithm builds a model that predicts whether a new example falls into one category or the other clustering cluster analysis is the assignment of a set of observations into subsets called clusters so that observations within the same cluster are similar according to some predesignated criterion or criteria while observations drawn from different clusters are dissimilar different clustering techniques make different assumptions on the structure of the data often defined by some similarity metric and evaluated for example by internal compactness similarity between members of the same cluster and separation between different clusters other methods are based on estimated density and graph connectivity clustering is a method of unsupervised learning and a common technique for statistical data analysis bayesian networks a bayesian network belief network or directed acyclic graphical model is a probabilistic graphical model that represents a set of random variables and their conditional independencies via a directed acyclic graph dag for example a bayesian network could represent the probabilistic relationships between diseases and symptoms given symptoms the network can be used to compute the probabilities of the presence of various diseases efficient algorithms exist that perform inference and learning reinforcement learning reinforcement learning is concerned with how an agent ought to take actions in an environment so as to maximize some notion of longterm reward reinforcement learning algorithms attempt to find a policy that maps states of the world to the actions the agent ought to take in those states reinforcement learning differs from the supervised learning problem in that correct inputoutput pairs are never presented nor suboptimal actions explicitly corrected representation learning several learning algorithms mostly unsupervised learning algorithms aim at discovering better representations of the inputs provided during training classical examples include principal components analysis and cluster analysis representation learning algorithms often attempt to preserve the information in their input but transform it in a way that makes it useful often as a preprocessing step before performing classification or predictions allowing reconstruction of the inputs coming from the unknown data generating distribution while not being necessarily faithful for configurations that are implausible under that distribution manifold learning algorithms attempt to do so under the constraint that the learned representation is lowdimensional sparse coding algorithms attempt to do so under the constraint that the learned representation is sparse has many zeros multilinear subspace learning algorithms aim to learn lowdimensional representations directly from tensor representations for multidimensional data without reshaping them into highdimensional vectors deep learning algorithms discover multiple levels of representation or a hierarchy of features with higherlevel more abstract features defined in terms of or generating lowerlevel features it has been argued that an intelligent machine is one that learns a representation that disentangles the underlying factors of variation that explain the observed data similarity and metric learning in this problem the learning machine is given pairs of examples that are considered similar and pairs of less similar objects it then needs to learn a similarity function or a distance metric function that can predict if new objects are similar it is sometimes used in recommendation systems sparse dictionary learning in this method a datum is represented as a linear combination of basis functions and the coefficients are assumed to be sparse let x be a ddimensional datum d be a d by n matrix where each column of d represents a basis function r is the coefficient to represent x using d mathematically sparse dictionary learning means solving x d r displaystyle xapprox dr where r is sparse generally speaking n is assumed to be larger than d to allow the freedom for a sparse representation learning a dictionary along with sparse representations is strongly nphard and also difficult to solve approximately a popular heuristic method for sparse dictionary learning is ksvd sparse dictionary learning has been applied in several contexts in classification the problem is to determine which classes a previously unseen datum belongs to suppose a dictionary for each class has already been built then a new datum is associated with the class such that its best sparsely represented by the corresponding dictionary sparse dictionary learning has also been applied in image denoising the key idea is that a clean image patch can be sparsely represented by an image dictionary but the noise cannot genetic algorithms a genetic algorithm ga is a search heuristic that mimics the process of natural selection and uses methods such as mutation and crossover to generate new genotype in the hope of finding good solutions to a given problem in machine learning genetic algorithms found some uses in the NUMBER s and NUMBER s conversely machine learning techniques have been used to improve the performance of genetic and evolutionary algorithms rulebased machine learning rulebased machine learning is a general term for any machine learning method that identifies learns or evolves rules to store manipulate or apply knowledge the defining characteristic of a rulebased machine learner is the identification and utilization of a set of relational rules that collectively represent the knowledge captured by the system this is in contrast to other machine learners that commonly identify a singular model that can be universally applied to any instance in order to make a prediction rulebased machine learning approaches include learning classifier systems association rule learning and artificial immune systems learning classifier systems learning classifier systems lcs are a family of rulebased machine learning algorithms that combine a discovery component e g typically a genetic algorithm with a learning component performing either supervised learning reinforcement learning or unsupervised learning they seek to identify a set of contextdependent rules that collectively store and apply knowledge in a piecewise manner in order to make predictions applications applications for machine learning includein NUMBER the online movie company netflix held the first netflix prize competition to find a program to better predict user preferences and improve the accuracy on its existing cinematch movie recommendation algorithm by at least NUMBER a joint team made up of researchers from att labsresearch in collaboration with the teams big chaos and pragmatic theory built an ensemble model to win the grand prize in NUMBER for NUMBER million shortly after the prize was awarded netflix realized that viewers ratings were not the best indicators of their viewing patterns everything is a recommendation and they changed their recommendation engine accordingly in NUMBER the wall street journal wrote about the firm rebellion research and their use of machine learning to predict the financial crisis in NUMBER cofounder of sun microsystems vinod khosla predicted that NUMBER of medical doctors jobs would be lost in the next two decades to automated machine learning medical diagnostic software in NUMBER it has been reported that a machine learning algorithm has been applied in art history to study fine art paintings and that it may have revealed previously unrecognized influences between artists model assessments classification machine learning models can be validated by accuracy estimation techniques like the holdout method which splits the data in a training and test set conventionally NUMBER training set and NUMBER test set designation and evaluates the performance of the training model on the test set in comparison the nfoldcrossvalidation method randomly splits the data in k subsets where the kNUMBER instances of the data are used to train the model while the kth instance is used to test the predictive ability of the training model in addition to the holdout and crossvalidation methods bootstrap which samples n instances with replacement from the dataset can be used to assess model accuracy in addition to overall accuracy investigators frequently report sensitivity and specificity meaning true positive rate tpr and true negative rate tnr respectively similarly investigators sometimes report the false positive rate fpr as well as the false negative rate fnr however these rates are ratios that fail to reveal their numerators and denominators the total operating characteristic toc is an effective method to express a models diagnostic ability toc shows the numerators and denominators of the previously mentioned rates thus toc provides more information than the commonly used receiver operating characteristic roc and rocs associated area under the curve auc ethics machine learning poses a host of ethical questions systems which are trained on datasets collected with biases may exhibit these biases upon use algorithmic bias thus digitizing cultural prejudices for example using job hiring data from a firm with racist hiring policies may lead to a machine learning system duplicating the bias by scoring job applicants against similarity to previous successful applicants responsible collection of data and documentation of algorithmic rules used by a system thus is a critical part of machine learning because language contains biases machines trained on language corpora will necessarily also learn bias software software suites containing a variety of machine learning algorithms include the following free and opensource software proprietary software with free and opensource editions proprietary software journals journal of machine learning researchmachine learningneural computation conferences conference on neural information processing systemsinternational conference on machine learninginternational conference on learning representations see also references further reading external links international machine learning societypopular online course by andrew ng at coursera it uses gnu octave the course is a free version of stanford universitys actual course taught by ng whose lectures are also available for free mloss is an academic database of opensource machine learning software
3,53587467,Outline of machine learning,Machine learning,the following outline is provided as an overview of and topical guide to machine learningmachine learning subfield of computer science more particularly soft computing that evolved from the study of pattern recognition and computational learning theory in artificial intelligence in NUMBER arthur samuel defined machine learning as a field of study that gives computers the ability to learn without being explicitly programmed machine learning explores the study and construction of algorithms that can learn from and make predictions on data such algorithms operate by building a model from an example training set of input observations in order to make datadriven predictions or decisions expressed as outputs rather than following strictly static program instructions what type of thing is machine learning an academic disciplinea branch of sciencean applied sciencea subfield of computer sciencea branch of artificial intelligencea subfield of soft computing branches of machine learning subfields of machine learning subfields of machine learningcomputational learning theory studying the design and analysis of machine learning algorithms grammar inductionmeta learning crossdisciplinary fields involving machine learning crossdisciplinary fields involving machine learningadversarial machine learningpredictive analyticsquantum machine learningrobot learningdevelopmental robotics applications of machine learning applications of machine learningbiomedical informaticscomputer visioncustomer relationship management data miningemail filteringinverted pendulum balance and equilibrium system natural language processing nlpautomatic summarizationautomatic taxonomy constructiondialog systemgrammar checkerlanguage recognitionhandwriting recognitionoptical character recognitionspeech recognitionmachine translationquestion answeringspeech synthesistext miningterm frequencyinverse document frequency tfidftext simplificationpattern recognitionfacial recognition systemhandwriting recognitionimage recognitionoptical character recognitionspeech recognitionrecommendation systemcollaborative filteringcontentbased filteringhybrid recommender systems collaborative and contentbased filteringsearch enginesearch engine optimization machine learning hardware machine learning hardwaregraphics processing unittensor processing unitvision processing unit machine learning tools machine learning tools listcomparison of deep learning softwarecomparison of deep learning softwareresources machine learning frameworks machine learning framework proprietary machine learning frameworks proprietary machine learning frameworksamazon machine learningmicrosoft azure machine learning studiodistbelief replaced by tensorflowmicrosoft cognitive toolkit open source machine learning frameworks open source machine learning frameworksapache singacaffehNUMBER omlpacktensorflowtorchaccord net machine learning libraries machine learning library listdeeplearningNUMBER jtheanoscikitlearn machine learning algorithms machine learning algorithm types of machine learning algorithms almeidapineda recurrent backpropagationalopexalmeidapineda recurrent backpropagationbackpropagationbootstrap aggregatingcnNUMBER algorithmconstructing skill treesdehaenechangeux modeldiffusion mapdominancebased rough set approachdynamic time warpingerrordriven learningevolutionary multimodal optimizationexpectationmaximization algorithmfasticaforwardbackward algorithmgenerecgenetic algorithm for rule set productiongrowing selforganizing maphexqhyper basis function networkidistanceknearest neighbors algorithmkernel methods for vector outputkernel principal component analysisleabralindebuzogray algorithmlocal outlier factorlogic learning machinelogitboostmanifold alignmentminimum redundancy feature selectionmixture of expertsmultiple kernel learningnonnegatiasdfasdfssdfasdfve matrix factorizationonline machine learningoutofbag errorprefrontal cortex basal ganglia working memorypvlvqlearningquadratic unconstrained binary optimizationquerylevel featurequickpropradial basis function networkrandomized weighted majority algorithmreinforcement learningrepeated incremental pruning to produce error reduction ripperrproprulebased machine learningskill chainingsparse pcastateactionrewardstateactionstochastic gradient descentstructured knntdistributed stochastic neighbor embeddingtemporal difference learningwakesleep algorithmweighted majority algorithm machine learning machine learning methods machine learning method listinstancebased algorithmknearest neighbors algorithm knnlearning vector quantization lvqselforganizing map somregression analysislogistic regressionordinary least squares regression olsrlinear regressionstepwise regressionmultivariate adaptive regression splines marsregularization algorithmridge regressionleast absolute shrinkage and selection operator lassoelastic netleastangle regression larsclassifiersprobabilistic classifiernaive bayes classifierbinary classifierlinear classifierhierarchical classifier dimensionality reduction dimensionality reductioncanonical correlation analysis ccafactor analysisfeature extractionfeature selectionindependent component analysis icalinear discriminant analysis ldamultidimensional scaling mdsnonnegative matrix factorization nmfpartial least squares regression plsrprincipal component analysis pcaprincipal component regression pcrprojection pursuitsammon mappingtdistributed stochastic neighbor embedding tsne ensemble learning ensemble learningadaboostboostingbootstrap aggregating baggingensemble averaging process of creating multiple models and combining them to produce a desired output as opposed to creating just one model frequently an ensemble of models performs better than any individual model because the various errors of the models average out gradient boosted decision tree gbrtgradient boosting machine gbmrandom foreststacked generalization blending meta learning meta learninginductive biasmetadata reinforcement learning reinforcement learningqlearningstateactionrewardstateaction sarsatemporal difference learning tdlearning automata supervised learning supervised learningaodeartificial neural networkassociation rule learning algorithmsapriori algorithmeclat algorithmcasebased reasoninggaussian process regressiongene expression programminggroup method of data handling gmdhinductive logic programminginstancebased learninglazy learninglearning automatalearning vector quantizationlogistic model treeminimum message length decision trees decision graphs etc nearest neighbor algorithmanalogical modelingprobably approximately correct learning pac learningripple down rules a knowledge acquisition methodologysymbolic machine learning algorithmssupport vector machinesrandom forestsensembles of classifiersbootstrap aggregating baggingboosting metaalgorithmordinal classificationinformation fuzzy networks ifnconditional random fieldanovaquadratic classifiersknearest neighborboostingsprintbayesian networksnaive bayeshidden markov modelshierarchical hidden markov model bayesian bayesian statisticsbayesian knowledge basenaive bayesgaussian naive bayesmultinomial naive bayesaveraged onedependence estimators aodebayesian belief network bbnbayesian network bn decision tree algorithms decision tree algorithmdecision treeclassification and regression tree cartiterative dichotomiser NUMBER idNUMBER cNUMBER NUMBER algorithmcNUMBER NUMBER algorithmchisquared automatic interaction detection chaiddecision stumpconditional decision treeidNUMBER algorithmrandom forestsliq linear classifier linear classifierfishers linear discriminantlinear regressionlogistic regressionmultinomial logistic regressionnaive bayes classifierperceptronsupport vector machine unsupervised learning unsupervised learningexpectationmaximization algorithmvector quantizationgenerative topographic mapinformation bottleneck method artificial neural networks artificial neural networkfeedforward neural networkextreme learning machinelogic learning machineselforganizing map association rule learning association rule learningapriori algorithmeclat algorithmfpgrowth algorithm hierarchical clustering hierarchical clusteringsinglelinkage clusteringconceptual clustering cluster analysis cluster analysisbirchdbscanexpectationmaximization emfuzzy clusteringhierarchical clusteringkmeans algorithmkmeans clusteringkmediansmeanshiftoptics algorithm anomaly detection anomaly detectionknearest neighbors classification knnlocal outlier factor semisupervised learning semisupervised learningactive learning special case of semisupervised learning in which a learning algorithm is able to interactively query the user or some other information source to obtain the desired outputs at new data points generative modelslowdensity separationgraphbased methodscotrainingtransduction deep learning deep learningdeep belief networksdeep boltzmann machinesdeep convolutional neural networksdeep recurrent neural networkshierarchical temporal memorydeep boltzmann machine dbmstacked autoencoders other machine learning methods and problems anomaly detectionassociation rulesbiasvariance dilemmaclassificationmultilabel classificationclusteringdata preprocessingempirical risk minimizationfeature engineeringfeature learninglearning to rankoccam learningonline machine learningpac learningregressionreinforcement learningsemisupervised learningstatistical learningstructured predictiongraphical modelsbayesian networkconditional random field crfhidden markov model hmmunsupervised learningvc theory machine learning research machine learning researchlist of artificial intelligence projectslist of datasets for machine learning research history of machine learning history of machine learningtimeline of machine learning machine learning projects machine learning projectsdeepmindgoogle brain machine learning organizations machine learning organizationsknowledge engineering and machine learning group machine learning conferences and workshops artificial intelligence and security aisec colocated workshop with ccsconference on neural information processing systems nipsecml pkddinternational conference on machine learning icml machine learning publications books on machine learning books about machine learning machine learning journals machine learningjournal of machine learning research jmlrneural computation persons influential in machine learning alberto broggiandrei knyazevandrew mccallumandrew ngarmin b cremersayanna howardbarney pellben goertzelben taskarbernhard schlkopfbrian d ripleychristopher g atkesoncorinna cortesdemis hassabisdouglas lenateric xingernst dickmannsgeoffrey hinton coinventor of the backpropagation and contrastive divergence training algorithmshanspeter kriegelhartmut nevenheikki mannilajacek m zuradajaime carbonelljerome h friedmanjohn d laffertyjohn platt invented smo and platt scalingjulie beth lovinsjrgen schmidhuberkarl steinbuchkatia sycaraleo breiman invented bagging and random forestslise getoorluca maria gambardellalon bottoumarcus huttermehryar mohrimichael collinsmichael i jordanmichael l littmannando de freitasofer dekeloren etzionipedro domingospeter flachpierre baldipushmeet kohliray kurzweilrayid ghaniross quinlansalvatore j stolfosebastian thrunselmer bringsjordsepp hochreitershane leggstephen muggletonsteve omohundrotom m mitchelltrevor hastievasant honavarvladimir vapnik coinventor of the svm and vc theoryyann lecun invented convolutional neural networksyasuo matsuyamayoshua bengiozoubin ghahramani see also outline of artificial intelligenceoutline of computer visionoutline of natural language processingoutline of roboticsaccuracy paradoxaction model learningactivation functionactivity recognitionadalineadaptive neuro fuzzy inference systemadaptive resonance theoryadditive smoothingadjusted mutual informationaika softwareaivaaixialchemyapialexnetalgorithm selectionalgorithmic inferencealgorithmic learning theoryalphagoalphago zeroalternating decision treeapprenticeship learningcausal markov conditioncompetitive learningconcept learningdecision tree learningdistribution learning theoryeager learningendtoend reinforcement learningerror tolerance pac learningexplanationbased learningfeatureglovehyperparameteribm machine learning hubinferential theory of learninglearning automatalearning classifier systemlearning rulelearning with errorsmtheory learning frameworkmachine learning controlmachine learning in bioinformaticsmarginmarkov chain geostatisticsmarkov chain monte carlo mcmcmarkov information sourcemarkov logic networkmarkov modelmarkov random fieldmarkovian discriminationmaximumentropy markov modelmultiarmed banditmultitask learningmultilinear subspace learningmultimodal learningmultiple instance learningmultipleinstance learningneverending language learningoffline learningparity learningpopulationbased incremental learningpredictive learningpreference learningproactive learningproximal gradient methods for learningsemantic analysissimilarity learningsparse dictionary learningstability learning theorystatistical learning theorystatistical relational learningtanagratransfer learningvariableorder markov modelversion space learningwaffleswekaloss functionloss functions for classificationmean squared error msemean squared prediction error mspetaguchi loss functionlowenergy adaptive clustering hierarchy other anne otateant colony optimization algorithmsanthony levandowskiantiunification computer scienceapache flumeapache giraphapache mahoutapache singaapache sparkapache systemmlaphelion softwarearabic speech corpusarchetypal analysisarthur zimekartificial antsartificial bee colony algorithmartificial developmentartificial immune systemastrostatisticsaveraged onedependence estimatorsbagofwords modelbalanced clusteringball treebase ratebat algorithmbaumwelch algorithmbayesian hierarchical modelingbayesian interpretation of kernel regularizationbayesian optimizationbayesian structural time seriesbees algorithmbehavioral clusteringbernoulli schemebiasvariance tradeoffbiclusteringbinarization of consensus partition matricesbinary classificationbing predictsbioinspired computingbiogeographybased optimizationbiplotbondys theorembongard problembradleyterry modelbrownboostbrown clusteringburst errorcbcl mitciml community portalcmaescure data clustering algorithmcache language modelcalibration statisticscanonical correspondence analysiscanopy clustering algorithmcascading classifierscategory utilitycellcognitioncellular evolutionary algorithmchisquare automatic interaction detectionchromosome genetic algorithmclassifier chainscleverbotclonal selection algorithmclusterweighted modelingclustering highdimensional dataclustering illusioncoboostingcobweb clusteringcognitive computercognitive roboticscollostructional analysiscommonmethod variancecompletelinkage clusteringcomputerautomated designconcept classconcept driftconference on artificial general intelligenceconference on knowledge discovery and data miningconfirmatory factor analysisconfusion matrixcongruence coefficientconnect computer systemconsensus clusteringconstrained clusteringconstrained conditional modelconstructive cooperative coevolutioncorrelation clusteringcorrespondence analysiscorticacoupled pattern learnercrossentropy methodcrossvalidation statisticscrossover genetic algorithmcuckoo searchcultural algorithmcultural consensus theorycurse of dimensionalitydadispdarpa lagr programdarkforestdartmouth workshopdarwintunesdata mining extensionsdata explorationdata preprocessingdata stream clusteringdataikudaviesbouldin indexdecision boundarydecision listdecision tree modeldeductive classifierdeepartdeepdreamdeep web technologiesdefining lengthdendrogramdependability state modeldetailed balancedetermining the number of clusters in a data setdetrended correspondence analysisdevelopmental roboticsdiffbotdifferential evolutiondiscrete phasetype distributiondiscriminative modeldissociated pressdistributed rdlibdocument classificationdocumenting hatedomain adaptationdoubly stochastic modeldualphase evolutiondunn indexdynamic bayesian networkdynamic markov compressiondynamic topic modeldynamic unobserved effects modeledlutelkiedge recombination operatoreffective fitnesselastic mapelastic matchingelbow method clusteringemergent softwareencogentropy rateerkki ojaeuriskoeuropean conference on artificial intelligenceevaluation of binary classifiersevolution strategyevolution windowevolutionary algorithm for landmark detectionevolutionary algorithmevolutionary artevolutionary musicevolutionary programmingevolvability computer scienceevolved antennaevolver softwareevolving classification functionexpectation propagationexploratory factor analysisfNUMBER scoreflame clusteringfactor analysis of mixed datafactor graphfactor regression modelfactored language modelfarthestfirst traversalfastandfrugal treesfeature selection toolboxfeature hashingfeature scalingfeature vectorfirefly algorithmfirstdifference estimatorfirstorder inductive learnerfish school searchfisher kernelfitness approximationfitness functionfitness proportionate selectionfluentdfoldinghomeformal concept analysisforward algorithmfowlkesmallows indexfrederick jelinekfrrolefunctional principal component analysisgattoglimmergary bryce fogelgaussian adaptationgaussian processgaussian process emulatorgene predictiongeneral architecture for text engineeringgeneralization errorgeneralized canonical correlationgeneralized filteringgeneralized iterative scalinggeneralized multidimensional scalinggenerative adversarial networkgenerative modelgenetic algorithmgenetic algorithm schedulinggenetic algorithms in economicsgenetic fuzzy systemsgenetic memory computer sciencegenetic operatorgenetic programminggenetic representationgeographical clustergesture description languagegeworkbenchglossary of artificial intelligenceglottochronologygolem ilpgoogle matrixgrafting decision treesgramian matrixgrammatical evolutiongranular computinggraphlabgraph kernelgregory john boylegremlin programming languagegrowth functionhumant humanoid ant algorithmhammersleyclifford theoremharmony searchhebbian theoryhidden markov random fieldhidden semimarkov modelhierarchical hidden markov modelhigherorder factor analysishighway networkhinge losshollands schema theoremhopkins statistichoshenkopelman algorithmhuber lossircfNUMBER ian goodfellowilastikilya sutskeverimmunocomputingimperialist competitive algorithminauthentic textincremental decision treeinduction of regular languagesinductive biasinductive probabilityinductive programminginfluence diagraminformation harvestinginformation fuzzy networksinformation gain in decision treesinformation gain ratioinheritance genetic algorithminstance selectionintel realsenseinteracting particle systeminteractive machine translationinternational joint conference on artificial intelligenceinternational meeting on computational intelligence methods for bioinformatics and biostatisticsinternational semantic web conferenceiris flower data setisland algorithmisotropic positionitem response theoryiterative viterbi decodingjoonejabberwackyjaccard indexjackknife variance estimates for random forestjava grammatical evolutionjoseph nechvataljubatusjulia programming languagejunction tree algorithmksvdkmeanskmedians clusteringkmedoidsknimekxen inc k qflatskagglekalman filterkatzs backoff modelkeraskernel adaptive filterkernel density estimationkernel eigenvoicekernel embedding of distributionskernel methodkernel perceptronkernel random forestkinectklausrobert mllerkneserney smoothingknowledge vaultknowledge integrationlibsvmlpboostlabeled datalanguagewarelanguage acquisition device computerlanguage identification in the limitlanguage modellarge margin nearest neighborlatent dirichlet allocationlatent class modellatent semantic analysislatent variablelatent variable modellattice minerlayered hidden markov modellearnable function classleast squares support vector machineleaveoneout errorleslie p kaelblinglinear genetic programminglinear predictor functionlinear separabilitylingyun gulinkuriouslior ron business executivelist of genetic algorithm applicationslist of metaphorbased metaheuristicslist of text mining softwarelocal casecontrol samplinglocal independencelocal tangent space alignmentlocalitysensitive hashingloglinear modellogistic model treelowrank approximationlowrank matrix approximationsmatlabmimic immunologymxnetmallet software projectmanifold regularizationmargininfused relaxed algorithmmargin classifiermark v shaneymassive online analysismatrix regularizationmatthews correlation coefficientmean shiftmean squared errormean squared prediction errormeasurement invariancemedoidmeemixmelomicsmemetic algorithmmetaoptimizationmexican international conference on artificial intelligencemichael kearns computer scientistminhashmixture modelmlpymodels of dna evolutionmoral graphmountain car problemmovidiusmultiarmed banditmultilabel classificationmulti expression programmingmulticlass classificationmultidimensional analysismultifactor dimensionality reductionmultilinear principal component analysismultiple correspondence analysismultiple discriminant analysismultiple factor analysismultiple sequence alignmentmultiplicative weight update methodmultispectral pattern recognitionmutation genetic algorithmmysteryvibengramnominate scaling methodnativelanguage identificationnatural language toolkitnatural evolution strategynearestneighbor chain algorithmnearest centroid classifiernearest neighbor searchneighbor joiningnest labsnetminernetowlneural designerneural engineering objectneural labneural modeling fieldsneural network softwareneurosolutionsneuro laboratoryneuroevolutionneurophniki ainoisy channel modelnoisy text analyticsnonlinear dimensionality reductionnovelty detectionnuisance variablenumentaoneclass classificationonnxopennlpoptimal discriminant analysisoracle data miningorange softwareordination statisticsoverfittingprogolpsipredpachinko allocationpagerankparallel metaheuristicparity benchmarkpartofspeech taggingparticle swarm optimizationpath dependencepattern language formal languagespeltarion synapseperplexitypersian speech corpuspicas apppietro peronapipeline pilotpiranha softwarepitmanyor processplate notationpolynomial kernelpop music automationpopulation processportable format for analyticspredictive model markup languagepredictive state representationpreference regressionpremature convergenceprincipal geodesic analysisprior knowledge for pattern recognitionprisma appprobabilistic action coresprobabilistic contextfree grammarprobabilistic latent semantic analysisprobabilistic soft logicprobability matchingprobit modelproduct of expertsprogramming with big data in rproper generalized decompositionpruning decision treespushpak bhattacharyyaq methodologyqlooquality control and genetic algorithmsquantum artificial intelligence labqueueing theoryquick drawr programming languagerada mihalcearademacher complexityradial basis function kernelrand indexrandom indexingrandom projectionrandom subspace methodranking svmrapidminerrattle guiraymond cattellreasoning systemregularization perspectives on support vector machinesrelational data miningrelationship squarerelevance vector machinerelief feature selectionrenjinrepertory gridrepresenter theoremrewardbased selectionrichard zemelright to explanationroboearthrobust principal component analysisruleml symposiumrule inductionrules extraction system familysas softwaresnnsspss modelersubclusample complexitysample exclusion dimensionsanta fe trail problemsavi technologyschema genetic algorithmssearchbased software engineeringselection genetic algorithmselfservice semantic suitesemantic foldingsemantic mapping statisticssemidefinite embeddingsense networkssensorium projectsequence labelingsequential minimal optimizationshattered setshogun toolboxsilhouette clusteringsimhashsimranksimilarity measuresimple matching coefficientsimultaneous localization and mappingsinkov statisticskymindsliced inverse regressionsmartmatchsnakes and ladderssoft independent modelling of class analogiessoft output viterbi algorithmsolomonoffs theory of inductive inferencesolveit softwarespectral clusteringspikeandslab variable selectionstatistical machine translationstatistical parsingstatistical semanticsstefano soattostephen wolframstochastic block modelstochastic cellular automatonstochastic diffusion searchstochastic grammarstochastic matrixstochastic universal samplingstress majorizationstring kernelstructural equation modelingstructural risk minimizationstructured sparsity regularizationstructured support vector machinesubclass reachabilitysufficient dimension reductionsukhotins algorithmsum of absolute differencessum of absolute transformed differencesswarm intelligenceswitching kalman filtersymbolic regressionsynchronous contextfree grammarsyntactic pattern recognitiontdgammontimitteaching dimensionteuvo kohonentextual casebased reasoningtheory of conjoint measurementthomas g dietterichthurstonian modeltopic modeltournament selectiontraining test and validation setstransiogramtrax image recognitiontrigram taggertruncation selectiontucker decompositionuimaupgmaugly duckling theoremuncertain datauniform convergence in probabilityunique negative dimensionuniversal portfolio algorithmuser behavior analyticsvc dimensionvgg image annotatorvigravalidation setvapnikchervonenkis theoryvariableorder bayesian networkvariable kernel density estimationvariable rules analysisvariational message passingvarimax rotationvector quantizationvicarious companyviterbi algorithmvowpal wabbitwaca clustering algorithmwpgmawards methodweasel programwhitening transformationwinnow algorithmwinstay loseswitchwitness setwolfram languagewolfram mathematicawriter invariantxgboostyooreekazeroth software further reading trevor hastie robert tibshirani and jerome h friedman NUMBER the elements of statistical learning springer isbn NUMBER pedro domingos september NUMBER the master algorithm basic books isbn NUMBER mehryar mohri afshin rostamizadeh ameet talwalkar NUMBER foundations of machine learning the mit press isbn NUMBER ian h witten and eibe frank NUMBER data mining practical machine learning tools and techniques morgan kaufmann NUMBER pp isbn NUMBER david j c mackay information theory inference and learning algorithms cambridge cambridge university press NUMBER isbn NUMBER richard o duda peter e hart david g stork NUMBER pattern classification NUMBER nd edition wiley new york isbn NUMBER christopher bishop NUMBER neural networks for pattern recognition oxford university press isbn NUMBER vladimir vapnik NUMBER statistical learning theory wileyinterscience isbn NUMBER ray solomonoff an inductive inference machine ire convention record section on information theory part NUMBER pp NUMBER NUMBER ray solomonoff an inductive inference machine a privately circulated report from the NUMBER dartmouth summer research conference on ai references external links data science data to insights from mit machine learninginternational machine learning societypopular online course by andrew ng at coursera it uses gnu octave the course is a free version of stanford universitys actual course taught by ng whose lectures are also available for free mloss is an academic database of opensource machine learning software


In [694]:
ml_json = ml_df_no_ns.to_json()

In [835]:
bs_json = bs_df_no_ns.to_json()

In [726]:
# for key in ml_dict.keys():
#     if type(key) is not str:
#         try:
#             ml_dict[str(key)] = ml_dict[key]
#         except:
#             try:
#                 ml_dict[repr(key)] = ml_dict[key]
#             except:
#                 pass
#         del ml_dict[key]

In [778]:
ml_dict.keys()

dict_keys(['pageid', 'title', 'category', 'text'])

In [794]:

from collections import OrderedDict
data = json.loads(ml_json, object_pairs_hook=OrderedDict)

In [836]:
data_2 = json.loads(bs_json, object_pairs_hook=OrderedDict)

In [796]:
data.keys()

odict_keys(['pageid', 'title', 'category', 'text'])

In [837]:
data_2.keys()

odict_keys(['pageid', 'title', 'category', 'text'])

### MONGO

In [1]:
!conda install --yes --quiet pymongo



Package plan for installation in environment /opt/conda:

The following NEW packages will be INSTALLED:

    libgfortran-ng: 7.2.0-h9f7466a_2  defaults   
    libopenblas:    0.2.20-hae245c1_3 defaults   
    pymongo:        3.4.0-py36_0      defaults   

The following packages will be DOWNGRADED:

    numexpr:        2.6.4-py36_0      conda-forge --> 2.6.2-py36_nomklhd302951_1 defaults [nomkl]



In [2]:
import pymongo

In [754]:
client = pymongo.MongoClient('35.167.189.162', 27016)

### Databases on Our MongoDB Server

In [786]:
coll_ref.drop()

In [787]:
client.database_names()

['admin', 'local', 'test', 'twitter']

In [788]:
test_db = client.test


### Make a new Database called `project4_database`


In [789]:
db_ref = client.project4_database

#### Create a reference to `my_project4_collection`

In [790]:
coll_ref = db_ref.my_project4_collection

In [839]:
coll_ref_bs = db_ref.my_project4_collection_bs

#### Show `databases` and the `collections`

In [840]:
client.database_names(), db_ref.collection_names()

(['admin', 'local', 'project4_database', 'test', 'twitter'],
 ['my_project4_collection'])

In [797]:

mongo_ml = coll_ref.insert_one(data)


In [841]:
mongo_bs = coll_ref_bs.insert_one(data_2)

In [817]:
cursor = coll_ref.find_one()

cursor

sample_docs = (cursor)

In [842]:
cursor_2 = coll_ref_bs.find_one()

cursor_2

sample_docs_2 = (cursor_2)

In [818]:
type(sample_docs)

dict

In [819]:
sample_docs.keys()

dict_keys(['_id', 'pageid', 'title', 'category', 'text'])

In [843]:
sample_docs_2.keys()

dict_keys(['_id', 'pageid', 'title', 'category', 'text'])

In [820]:
from_mongo_ml_df = pd.DataFrame(sample_docs)

In [844]:
from_mongo_bs_df = pd.DataFrame(sample_docs_2)

In [822]:
from_mongo_ml_df.shape

(1075, 5)

In [845]:
from_mongo_bs_df.shape

(1918, 5)

In [846]:
from_mongo_bs_df.head()

Unnamed: 0,_id,category,pageid,text,title
0,5a165d59608b9000b627015d,Business software,1037763,business software or a business application is any software or set of computer programs used by business users to perform various business functions these business applications are used to increase productivity to measure productivity and to perform other business functions accurately by and large business software is likely to be developed to meet the needs of a specific business and therefore is not easily transferable to a different business environment unless its nature and operation is identical due to the unique requirements of each business offtheshelf software is unlikely to completely address a companys needs however where an ontheshelf solution is necessary due to time or monetary considerations some level of customization is likely to be required exceptions do exist depending on the business in question and thorough research is always required before committing to bespoke or offtheshelf solutions some business applications are interactive i e they have a graphical user interface or user interface and users can querymodifyinput data and view results instantaneously they can also run reports instantaneously some business applications run in batch mode they are set up to run based on a predetermined eventtime and a business user does not need to initiate them or monitor them some business applications are built inhouse and some are bought from vendors off the shelf software products these business applications are installed on either desktops or big servers prior to the introduction of cobol a universal compiler in NUMBER businesses developed their own unique machine language rcas language consisted of a NUMBER position instruction for example to read a record into memory the first two digits would be the instruction action code the next four positions of the instruction an a address would be the exact leftmost memory location where you want the readable character to be placed four positions a b address of the instruction would note the very rightmost memory location where you want the last character of the record to be located a two digit b address also allows a modification of any instruction instruction codes and memory designations excluded the use of NUMBER s or NUMBER s the first rca business application was implemented in NUMBER on a NUMBER k rca NUMBER the rca NUMBER mid frame NUMBER and large frame NUMBER began their marketing in early NUMBER many kinds of users are found within the business environment and can be categorized by using a small medium and large matrixthe small business market generally consists of home accounting software and office suites such as openoffice org or microsoft office the medium size or small and mediumsized enterprise sme has a broader range of software applications ranging from accounting groupware customer relationship management human resource management systems outsourcing relationship management loan origination software shopping cart software field service software and other productivity enhancing applications the last segment covers enterprise level software applications such as those in the fields of enterprise resource planning enterprise content management ecm business process management bpm and product lifecycle management these applications are extensive in scope and often come with modules that either add native functions or incorporate the functionality of thirdparty computer programs technologies that previously only existed in peertopeer software applications like kazaa and napster are starting to appear within business applications types of business tools enterprise application software easresource managementdigital dashboards also known as business intelligence dashboards enterprise dashboards or executive dashboards these are visually based summaries of business data that show ataglance understanding of conditions through metrics and key performance indicators kpis dashboards are a very popular tools that have arisen in the last few years online analytical processing olap which include holap rolap and molap are a capability of some management decision support and executive information systems that support interactive examination of large amounts of data from many perspectives reporting software generates aggregated views of data to keep the management informed about the state of their business procurement software is business software that helps to automate the purchasing function of organizations data mining is the extraction of consumer information from a database by utilizing software that can isolate and identify previously unknown patterns or trends in large amounts of data there is a variety of data mining techniques that reveal different types of patterns some of the techniques that belong here are statistical methods particularly business statistics and neural networks as very advanced means of analyzing data business performance management bpmdocument management software is made for organizing and managing multiple documents of various types some of them have storage functions for security and backup of valuable business information employee scheduling software used for creating and distributing employee schedules as well as for tracking employee hours brief history the essential motivation for business software is to increase profits by cutting costs or speeding the productive cycle in the earliest days of whitecollar business automation large mainframe computers were used to tackle the most tedious jobs like bank cheque clearing and factory accounting factory accounting software was among the most popular of early business software tools and included the automation of general ledgers fixed assets inventory ledgers cost accounting ledgers accounts receivable ledgers and accounts payable ledgers including payroll life insurance health insurance federal and state insurance and retirement the early use of software to replace manual whitecollar labor was extremely profitable and caused a radical shift in whitecollar labor one computer might easily replace NUMBER whitecollar pencil pushers and the computer would not require any health or retirement benefits building on these early successes with ibm hewlettpackard and other early suppliers of business software solutions corporate consumers demanded business software to replace the oldfashioned drafting board cadcam software or computeraided drafting for computeraided manufacturing arrived in the early NUMBER s also project management software was so valued in the early NUMBER s that it might cost as much as NUMBER per copy although such software typically had far fewer capabilities than modern project management software such as microsoft project which one might purchase today for under NUMBER per copy in the early days perhaps the most noticeable widespread change in business software was the word processor because of its rapid rise the ubiquitous ibm typewriter suddenly vanished in the NUMBER s as millions of companies worldwide shifted to the use of word perfect business software and later microsoft word software another vastly popular computer program for business were mathematical spreadsheet programs such as lotus NUMBER and later microsoft excel in the NUMBER s business shifted massively towards globalism with the appearance of sap software which coordinates a supplychain of vendors potentially worldwide for the most efficient streamlined operation of factory manufacture yet nothing in the history of business software has had the global impact of the internet with its email and websites that now serve commercial interests worldwide globalism in business fully arrived when the internet became a household word the next phase in the evolution of business software is being led by the emergance of robotic process automation rpa which involves identifying and automating highly repetitive tasks and processes with an aim to drive operational efficiency reduce costs and limit human error industries that have been in the forefront of rpa adoption include the insurance industry banking and financial services the legal industry and the healthcare industry application support business applications are built based on the requirements from the business users also these business applications are built to use certain kind of business transactions or data items these business applications run flawlessly until there are no new business requirements or there is no change in underlying business transactions also the business applications run flawlessly if there are no issues with computer hardware computer networks intenetintranet computer disks power supplies and various software components middleware database computer programs etc business applications can fail when an unexpected error occurs this error could occur due to a data error an unexpected data input or a wrong data input an environment error an in frastructure related error a programming error a human error or a work flow error when a business application fails one needs to fix the business application error as soon as possible so that the business users can resume their work this work of resolving business application errors is known as business application support reporting errors the business user calls the business application support team phone number or sends an email to the business application support team the business application support team gets all the details of the error from the business user on the phone or from the email these details are then entered in a tracking software the tracking software creates a request number and this request number is given to the business user this request number is used to track the progress on the support issue the request is assigned to a support team member notification of errors for critical business application errors such as an application not available or an application not working correctly an email is sent to the entire organization or impacted teams so that they are aware of the issue they are also provided with an estimated time for application availability investigation or analysis of application errors the business application support team member collects all the necessary information about the business software error this information is then recorded in the support request all of the data used by the business user is also used in the investigation the application program is reviewed for any possible programming errors error resolution if any similar business application errors occurred in the past then the issue resolution steps are retrieved from the support knowledge base and the error is resolved using those steps if it is a new support error then new issue resolution steps are created and the error is resolved the new support error resolution steps are recorded in the knowledge base for future use for major business application errors critical infrastructure or application failures a phone conference call is initiated and all required support personsteams join the call and they all work together to resolve the error code correction if the business application error occurred due to programming errors then a request is created for the application development team to correct programming errors if the business user needs new features or functions in the business application then the required analysisdesignprogrammingtestingrelease is planned and a new version of the business software is deployed business process correction if the business application error occurred due to a work flow issue or human errors during data input then the business users are notified business users then review their work flow and revise it if necessary they also modify the user guide or user instructions to avoid such an error in the future infrastructure issue correction if the business application error occurred due to infrastructure issues then the specific infrastructure team is notified the infrastructure team then implements permanent fixes for the issue and monitors the infrastructure to avoid the reoccurrence of the same error support follow up and internal reporting the business application error tracking system is used to review all issues periodically daily weekly and monthly and reports are generated to monitor the resolved issues repeating issues and pending issues reports are also generated for the itis management for improvement and management of business applications see also references external links,Business software
1,5a165d59608b9000b627015d,Business software,41270069,accusystems llc is an american company headquartered in pueblo colorado that develops licenses supports and sells document imaging software and electronic document management primarily to the banking and finance industries over NUMBER banks currently use accusystems software in NUMBER accusystems made its first sale to peoples bank accusystems software known as accuaccount is mainly used to electronically scan store and manage loan files and any other associated paperwork in september NUMBER accusystems announced the acquisition of xtria rms financial institutions use the xtria rms software now known as tickler to track exceptions electronically in NUMBER accusystems won the banknews innovative solutions award for management software solutions in NUMBER banktech published a survey taken by accusystems that surveyed NUMBER community banks the survey found that only NUMBER of banks are using paperless systems for issuing loans the study is often used when discussing paperless systems in the uss banking industry references external links official website,AccuSystems
10,5a165d59608b9000b627015d,Business software,35959361,the architecture of interoperable information systems aios is a reference architecture for the development of interoperable enterprise information systems if enterprises or public administrations want to engage in automated business processes with other organizations their it systems must be able to work together i e they need to be interoperable the aios represents a generic building plan for these organizations to develop interoperable information systems by systematically adjusting and extending their internal information systems the aios was described in a doctoral thesis and is based on the results of various research projects on interoperability it is independent from specific products or vendors but describes generically the different layers views relationships and technical means needed to efficiently establish interoperable information systems to this aim it combines concepts from serviceoriented architecture collaborative business and business process modelling it can be seen as complementary to aris a wellknown architecture for internal information systems and business processes definition similar to the automation of processes inside organizations the automation of crossorganizational business processes is an important trend in this endeavor collaborating organizations rather strive for a loose coupling of their information systems instead of a tight integration the collaborating information systems should be able to work together but retain as much independency as possible this characteristic is also called interoperability or in the context of collaborating organizations business interoperability i e the capability of autonomous organizations to execute a collaborative business process among them information systems are systems that process information i e they capture transport transform store and offer information following the conception prevailing in information systems research an information system comprises not only the hardware and software of an enterprise but also the related human actors business functions and processes as well as organization structures this broad understanding is for example also embodied by the zachman framework architecture is defined as the fundamental organization of a system embodied in its components their relationships to each other and the environment and the principles governing its design and evolution sinz defines an information system architecture as the building plan of an information system in the sense of a specification and documentation of its components and their relationships covering all relevant viewpoints as well as the constructions rules for the creation of the building plan accordingly an architecture of interoperable information systems can be defined as the building plan of a crossorganizational information system which enables organizations to execute a collaborative business process among them background and application following the work on interoperable information systems conducted in european research projects in NUMBER the architecture of interoperable information systems aios was published as a reference for the construction of loosely coupled interoperating information systems and for the systematic modelbased enactment of collaborative business processes the aios aims primarily at large organizations that want to interoperate with each other to this aim it describes how internal information system elements can be systematically connected with the information systems of collaboration partners the main elements of the aios aredescription of the different data types comprised in interoperable information system as well as their relationships this is also called the static part or the structure of the architecture it tells organizations which information elements e g descriptions of messages exchange sequences roles and services they have to provide to collaboration partners and how they can optimally correlate these to internal elements description of different building paths for implementing or adjusting interoperable information systems this is also called the dynamic part of the architecture it tells organization how to iteratively develop the elements mentioned above concept for the technical components needed to implemenent the architecture for example design tools internal and externally visible repositories one element comprised in the third category is a biirepository in which each organization publishes the content of its business interoperability interface bii to collaboration partners since it comprises external views on information system elements it provides publishing and discovery functionalities as needed in serviceoriented architecture in the bii the externally relevant processes services organization structures etc are described on various levels of technical granularity enabling other organizations to search also for businesslevel elements and not only for technical artifacts here different from the traditional soa approach instead of one central service directory various partnerspecific repositories are implemented structure the static part of the architecture builds on three orthogonal axes enterprise dimensions levels of technical granularity and colloborative views collaborative views similar to private public and global views as known from business process and workflow modeling in the aios corresponding private public and global views on information system elements are provided the private view comprises the only internally visible information system elements the public view acts as an interface to the internal private system elements it protects internal systems and enables interoperability without the need for a significant change to the internal systems this public view describes the information system boundaries of an organization to its collaboration partners and connects internal and external information systems thereby also providing the content of the business interoperability interface of an organization the global view can be used to correlate and connect the public views of different systems enterprise dimensions to describe business processes comprehensively this axis provides distinct views on processes functions data and organizational elements in the organizational dimension roles units and other organization elements relevant for the collaboration are described and related to internal elements this ensures for example that the collaboration partners have a common understanding of the interacting roles in the data dimension document types used in the collaboration are defined and related to internally used document types in the function dimension business functions and services offered in the collaboration are described in the process dimension the processes that each organization offers are described as well as how these public processes are related to adjacent processes of partner organizations thus in combination with the axis collaborative views private public and global views on processes functions data and organizational roles are provided levels of technical granularity the description of system elements on different levels of technical granularity supports a systematic development of collaborative information systems starting with the business requirements definition and going all the way down to the code level apart from the construction aspect thereby also a multidimensional interoperability description is provided facilitating the synchronization of collaborating systems on each level similar to for example aris and omgs mda three levels are usedbusiness level here the processes to be automated are described from a technique independent level in mda this level is referred to as cim level technical level here the it concept is described therefore the models from the first level are technically enriched for example instead of business functions now components are described but still on a coarsegrained conceptual level since the models on the second level represent the basis for an automated generation of executable code they might have to be further adapted to fit implementation level constraints execution level here the models are machine interpretable and can be used during runtime in the execution of processes references,Architecture of Interoperable Information Systems
100,5a165d59608b9000b627015d,Business software,11028436,facl is a software system for management of public educational and government access peg cable television and community media centers the facl software was developed by becker software tucson arizona in an ongoing partnership with access tucson the system manages among other things contact information equipment and facilities reservations project tracking program library television channel scheduling class scheduling and registration volunteers and payments facl is specifically designed for publicaccess television and community media centers,Facíl
1000,5a165d59608b9000b627015d,Business software,25822348,microsoft dynamics erp is enterprise resource planning erp software primarily geared toward midsize organizations as well as subsidiaries and divisions of larger organizations its applications are part of microsoft dynamics a line of business management software owned and developed by microsoft microsoft dynamics erp comprises five primary products microsoft dynamics ax microsoft dynamics gp microsoft dynamics nav microsoft dynamics sl and microsoft dynamics cNUMBER products microsoft dynamics erp includes five primary productsmicrosoft dynamics ax formerly axapta is designed to help organizations do business across locations and countries by standardizing processes and helping to simplify compliance the latest version is microsoft dynamics ax NUMBER rNUMBER cuNUMBER microsoft dynamics gp formerly great plains software can help companies adapt to new opportunities and growth by managing changing markets enabling unique business requirements and connecting business processes across the organization the latest version is microsoft dynamics gp NUMBER version NUMBER NUMBER microsoft dynamics nav formerly navision is designed to help organizations streamline specialized and industryspecific business processes the latest version is microsoft dynamics nav NUMBER microsoft dynamics sl formerly solomon iv can help projectdriven organizations obtain reports and business analysis and automate projects across company divisions and locations the latest version is microsoft dynamics sl NUMBER microsoft dynamics cNUMBER formerly concorde cNUMBER can assist with finance manufacturing supply chains analytics and electronic commerce for small and mediumsized enterprises the latest version is microsoft dynamics cNUMBER NUMBER capabilities microsoft dynamics erp applications are designed to help customersmanage the entire supply chain make current financial data and reports accessible for business planning and regulatory compliance automate repetitious and routine functions so that employees can focus on more critical tasks minimize the cost and complexity of administering salaries benefits recruiting and performance management provide greater visibility into key performance factors such as profitability and potential issues meet industryspecific needs with functionality for vertical business processes microsoft dynamics erp has five focus industriesfinancial servicesmanufacturingpublic sectorretailservice industry see also list of erp software packagesmicrosoft dynamics crm references external links official website,Microsoft Dynamics ERP
