# vDataFrame.analytic

In [None]:
vDataFrame.analytic(func: str, 
                    column: str = "",
                    by: list = [], 
                    order_by = [], 
                    column2: str = "", 
                    name: str = "",
                    offset: int = 1,
                    x_smoothing: float = 0.5,
                    add_count: bool = True)

Adds a new vcolumn to the vDataFrame by using an advanced analytical function on one or two specific vcolumns.

### Parameters

<table id="parameters">
    <tr> <th>Name</th> <th>Type</th> <th>Optional</th> <th>Description</th> </tr>
    <tr> <td><div class="param_name">func</div></td> <td><div class="type">str</div></td> <td><div class = "no">&#10060;</div></td> <td>Function to use.<br>
                                                    <ul>
                                                        <li><b>beta :</b> Beta Coefficient between 2 vcolumns</li>
                                                        <li><b>count :</b> number of non-missing elements</li>
                                                        <li><b>corr :</b> Pearson correlation between 2 vcolumns</li>
                                                        <li><b>cov :</b> covariance between 2 vcolumns</li>
                                                        <li><b>ema :</b> exponential moving average</li>
                                                        <li><b>first_value :</b> first non null lead</li>
                                                        <li><b>iqr :</b> interquartile range</li>
                                                        <li><b>dense_rank :</b> dense rank</li>
                                                        <li><b>kurtosis :</b> kurtosis</li>
                                                        <li><b>jb :</b> Jarque-Bera index</li>
                                                        <li><b>lead :</b> next element</li>
                                                        <li><b>lag :</b> previous element</li>
                                                        <li><b>last_value :</b> first non null lag</li>
                                                        <li><b>mad :</b> median absolute deviation</li>
                                                        <li><b>mae :</b> mean absolute error (deviation)</li>
                                                        <li><b>max :</b> maximum</li>
                                                        <li><b>mean :</b> average</li>
                                                        <li><b>median :</b> median</li>
                                                        <li><b>min :</b> minimum</li>
                                                        <li><b>mode :</b> most frequent element</li>
                                                        <li><b>q% :</b> q quantile (ex: 50% for the median)</li>
                                                        <li><b>pct_change :</b> ratio between the current value and the previous one</li>
                                                        <li><b>percent_rank :</b> percent rank</li>
                                                        <li><b>prod :</b> product</li>
                                                        <li><b>range :</b> difference between the max and the min</li>
                                                        <li><b>rank :</b> rank</li>
                                                        <li><b>row_number :</b> row number</li>
                                                        <li><b>sem :</b> standard error of the mean</li>
                                                        <li><b>skewness :</b> skewness</li>
                                                        <li><b>sum :</b> sum</li>
                                                        <li><b>std :</b> standard deviation</li>
                                                        <li><b>unique :</b> cardinality (count distinct)</li>
                                                        <li><b>var :</b> variance</li></ul>
                                                        Other analytical functions may also work if they are supported by the DB version you are using.</td> </tr>
    <tr> <td><div class="param_name">column</div></td> <td><div class="type">str</div></td> <td><div class = "yes">&#10003;</div></td> <td>Input vcolumn.</td> </tr>
     <tr> <td><div class="param_name">by</div></td> <td><div class="type">list</div></td> <td><div class = "yes">&#10003;</div></td> <td>vcolumns used in the partition.</td> </tr>
    <tr> <td><div class="param_name">order_by</div></td> <td><div class="type">dict / list</div></td> <td><div class = "yes">&#10003;</div></td> <td>List of the vcolumns used to sort the data in ascending order, or a dictionary mapping each vcolumn to its sorting method. For example, to sort by "column1" ASC and "column2" DESC, write {"column1": "asc", "column2": "desc"}.</td> </tr>
    <tr> <td><div class="param_name">column2</div></td> <td><div class="type">str</div></td> <td><div class = "yes">&#10003;</div></td> <td>Second input vcolumn in case of functions using 2 parameters.</td> </tr>
    <tr> <td><div class="param_name">name</div></td> <td><div class="type">str</div></td> <td><div class = "yes">&#10003;</div></td> <td>Name of the new vcolumn. If empty, a default name based on the other parameters will be generated.</td> </tr>
    <tr> <td><div class="param_name">offset</div></td> <td><div class="type">int</div></td> <td><div class = "yes">&#10003;</div></td> <td>Lead/Lag offset if parameter 'func' is the function 'lead'/'lag'.</td> </tr>
    <tr> <td><div class="param_name">x_smoothing</div></td> <td><div class="type">float</div></td> <td><div class = "yes">&#10003;</div></td> <td>Smoothing parameter of the exponential moving average, used only if the function is 'ema'. It must be in [0, 1].</td> </tr>
    <tr> <td><div class="param_name">add_count</div></td> <td><div class="type">bool</div></td> <td><div class = "yes">&#10003;</div></td> <td>If the function is 'mode' and this parameter is True, another column containing the number of occurrences of the mode is added to the vDataFrame.</td> </tr>
    
</table>

### Returns

<b>vDataFrame</b> : self

### Example

In [8]:
from vertica_ml_python import vDataFrame
flights = vDataFrame("public.flights")
flights.eval(name = "week", expr = "WEEK(scheduled_departure)")
print(flights)

0,1,2,3,4,5,6,7
,departure_delay,origin_airport,scheduled_departure,airline,destination_airport,arrival_delay,week
0.0,14,DTW,2015-08-16 20:12:00,EV,ABE,5,34
1.0,29,DTW,2015-08-17 10:07:00,EV,ABE,27,34
2.0,19,ATL,2015-08-17 10:25:00,EV,ABE,10,34
3.0,4,ORD,2015-08-17 14:00:00,EV,ABE,61,34
4.0,-5,DTW,2015-08-17 14:12:00,EV,ABE,-17,34
,...,...,...,...,...,...,...


<object>  Name: flights, Number of rows: 4068736, Number of columns: 7


In [5]:
# LAG of departure_delay for the same flight (same airline and same 
# origin/destination airports)
flights.analytic(func = "lag",
                 column = "departure_delay",
                 by = ["origin_airport", "destination_airport", "airline"],
                 order_by = {"scheduled_departure": "asc"})

0,1,2,3,4,5,6,7,8
,departure_delay,origin_airport,scheduled_departure,airline,destination_airport,arrival_delay,week,lag_departure_delay__by_origin_airport_destination_airport_airline_order_by_scheduled_departure
0.0,-3,10397,2015-10-01 21:06:00,DL,10135,-14,40,
1.0,-2,10397,2015-10-02 21:06:00,DL,10135,-14,40,-3
2.0,-2,10397,2015-10-03 21:06:00,DL,10135,-16,40,-2
3.0,-7,10397,2015-10-04 21:06:00,DL,10135,-13,41,-2
4.0,1,10397,2015-10-05 21:06:00,DL,10135,1,41,-7
,...,...,...,...,...,...,...,...


<object>  Name: flights, Number of rows: 4068736, Number of columns: 8

In [7]:
# Airline operating the most flights from each origin airport in each week
flights.analytic(func = "mode",
                 column = "airline",
                 by = ["origin_airport", "week"],
                 add_count = True)

0,1,2,3,4,5,6,7,8,9
,departure_delay,origin_airport,scheduled_departure,airline,destination_airport,arrival_delay,week,mode_airline__by_origin_airport_week,mode_airline__by_origin_airport_week_count
0.0,-6,10135,2015-10-01 12:00:00,EV,10397,10,40,EV,13
1.0,-5,10135,2015-10-01 16:00:00,EV,10397,-1,40,EV,13
2.0,-3,10135,2015-10-02 12:00:00,EV,10397,-3,40,EV,13
3.0,-5,10135,2015-10-02 16:00:00,EV,10397,-5,40,EV,13
4.0,-9,10135,2015-10-03 14:00:00,EV,10397,-14,40,EV,13
,...,...,...,...,...,...,...,...,...


<object>  Name: flights, Number of rows: 4068736, Number of columns: 9

In [9]:
# Correlation between the arrival delay and departure delay for the 
# same origin and destination airports
flights.analytic(func = "corr",
                 column = "departure_delay",
                 column2 = "arrival_delay",
                 by = ["origin_airport", "destination_airport"])

0,1,2,3,4,5,6,7,8,9,10
,departure_delay,origin_airport,scheduled_departure,airline,destination_airport,arrival_delay,week,departure_delay_meanby_origin_airport_destination_airport,arrival_delay_meanby_origin_airport_destination_airport,corr_departure_delay_arrival_delay_by_origin_airport_destination_airport
0.0,-3,10397,2015-10-31 21:06:00,DL,10135,-28,44,5.54651162790698,5.54651162790698,0.933563722644589
1.0,-4,10397,2015-10-31 10:27:00,EV,10135,-13,44,5.54651162790698,5.54651162790698,0.933563722644589
2.0,-4,10397,2015-10-30 21:06:00,DL,10135,-14,44,5.54651162790698,5.54651162790698,0.933563722644589
3.0,-3,10397,2015-10-30 14:44:00,EV,10135,-6,44,5.54651162790698,5.54651162790698,0.933563722644589
4.0,29,10397,2015-10-30 10:27:00,EV,10135,24,44,5.54651162790698,5.54651162790698,0.933563722644589
,...,...,...,...,...,...,...,...,...,...


<object>  Name: flights, Number of rows: 4068736, Number of columns: 10

### See Also

<table id="seealso">
    <tr><td><a href="../eval">vDataFrame.eval</a></td> <td> Evaluates a customized expression.</td></tr>
    <tr><td><a href="../rolling">vDataFrame.rolling</a></td> <td> Computes a customized moving window.</td></tr>
</table>